+    Default number of tasks to use across the cluster for distributed shuffle operations
+    (<code>groupByKey</code>, <code>reduceByKey</code>, etc) when not set by user.
+    … the <code>BlockManager</code> might take a performance hit.
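As a hedged illustration of the setting described above (a sketch, not part of the original diff: `spark.default.parallelism` is the standard Spark configuration key this description belongs to, and the app name, value, and data are assumptions):

{% highlight scala %}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.SparkContext._   // pair-RDD implicits (needed on older Spark versions)

// Set a cluster-wide default number of tasks for shuffle operations
// such as groupByKey and reduceByKey.
val conf = new SparkConf()
  .setAppName("ParallelismSketch")
  .set("spark.default.parallelism", "16")
val sc = new SparkContext(conf)

val pairs = sc.parallelize(Seq(("a", 1), ("b", 2), ("a", 3)))
// ...or override the number of reduce tasks for a single operation.
val counts = pairs.reduceByKey(_ + _, 4)
{% endhighlight %}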
+ +As a consequence, it is often necessary to be able to move between table and graph views of the same +physical data and to leverage the properties of each view to easily and efficiently express +computation. However, existing graph analytics pipelines must compose graph-parallel and data- +parallel systems, leading to extensive data movement and duplication and a complicated programming +model.GraphX optimizes the representation of `VD` and `ED` when they are plain old data-types (e.g., -> int, double, etc...) reducing the in memory footprint. +> GraphX optimizes the representation of vertex and edge types when they are plain old data-types +> (e.g., int, double, etc...) reducing the in memory footprint by storing them in specialized +> arrays. -In some cases we may wish to have vertices with different property types in the same graph. This can -be accomplished through inheritance. For example to model users and products as a bipartite graph -we might do the following: +In some cases it may be desirable to have vertices with different property types in the same graph. +This can be accomplished through inheritance. For example to model users and products as a +bipartite graph we might do the following: {% highlight scala %} class VertexProperty() @@ -116,9 +132,11 @@ var graph: Graph[VertexProperty, String] = null {% endhighlight %} Like RDDs, property graphs are immutable, distributed, and fault-tolerant. Changes to the values or -structure of the graph are accomplished by producing a new graph with the desired changes. The graph -is partitioned across the workers using a range of vertex-partitioning heuristics. As with RDDs, -each partition of the graph can be recreated on a different machine in the event of a failure. +structure of the graph are accomplished by producing a new graph with the desired changes. Note +that substantial parts of the original graph (i.e., unaffected structure, attributes, and indicies) +are reused in the new graph reducing the cost of this inherently functional data-structure. The +graph is partitioned across the workers using a range of vertex-partitioning heuristics. As with +RDDs, each partition of the graph can be recreated on a different machine in the event of a failure. Logically the property graph corresponds to a pair of typed collections (RDDs) encoding the properties for each vertex and edge. As a consequence, the graph class contains members to access @@ -131,12 +149,12 @@ class Graph[VD, ED] { } {% endhighlight %} -The classes `VertexRDD[VD]` and `EdgeRDD[ED]` extend and are optimized versions of `RDD[(VertexId, +The classes `VertexRDD[VD]` and `EdgeRDD[ED]` extend and are optimized versions of `RDD[(VertexID, VD)]` and `RDD[Edge[ED]]` respectively. Both `VertexRDD[VD]` and `EdgeRDD[ED]` provide additional functionality built around graph computation and leverage internal optimizations. We discuss the `VertexRDD` and `EdgeRDD` API in greater detail in the section on [vertex and edge RDDs](#vertex_and_edge_rdds) but for now they can be thought of as simply RDDs of the form: -`RDD[(VertexId, VD)]` and `RDD[Edge[ED]]`. +`RDD[(VertexID, VD)]` and `RDD[Edge[ED]]`. 
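To make the bipartite modeling above concrete, here is a minimal sketch (the data, vertex ids, and edge label are illustrative assumptions, not part of the original guide) that builds such a mixed-type graph and pattern matches on the concrete vertex property type:

{% highlight scala %}
// Assumes `sc` and the GraphX imports used by the surrounding examples.
val bipartiteVertices: RDD[(VertexId, VertexProperty)] =
  sc.parallelize(Array((1L, UserProperty("alice")), (2L, ProductProperty("widget", 9.99))))
val bipartiteEdges: RDD[Edge[String]] =
  sc.parallelize(Array(Edge(1L, 2L, "purchased")))
val bipartiteGraph: Graph[VertexProperty, String] = Graph(bipartiteVertices, bipartiteEdges)

// Pattern match on the concrete subclass to recover user- or product-specific fields.
val numProducts = bipartiteGraph.vertices.filter {
  case (_, ProductProperty(_, _)) => true
  case _ => false
}.count
{% endhighlight %}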
### Example Property Graph @@ -168,7 +186,7 @@ code constructs a graph from a collection of RDDs: // Assume the SparkContext has already been constructed val sc: SparkContext // Create an RDD for the vertices -val users: RDD[(VertexID, (String, String))] = +val users: RDD[(VertexId, (String, String))] = sc.parallelize(Array((3L, ("rxin", "student")), (7L, ("jgonzal", "postdoc")), (5L, ("franklin", "prof")), (2L, ("istoica", "prof")))) // Create an RDD for edges @@ -183,7 +201,7 @@ val graph = Graph(users, relationships, defaultUser) In the above example we make use of the [`Edge`][Edge] case class. Edges have a `srcId` and a `dstId` corresponding to the source and destination vertex identifiers. In addition, the `Edge` -class contains the `attr` member which contains the edge property. +class has an `attr` member which stores the edge property. [Edge]: api/graphx/index.html#org.apache.spark.graphx.Edge @@ -199,7 +217,7 @@ graph.edges.filter(e => e.srcId > e.dstId).count {% endhighlight %} > Note that `graph.vertices` returns an `VertexRDD[(String, String)]` which extends -> `RDD[(VertexId, (String, String))]` and so we use the scala `case` expression to deconstruct the +> `RDD[(VertexID, (String, String))]` and so we use the scala `case` expression to deconstruct the > tuple. On the other hand, `graph.edges` returns an `EdgeRDD` containing `Edge[String]` objects. > We could have also used the case class type constructor as in the following: > {% highlight scala %} @@ -266,6 +284,75 @@ able to support different graph representations in the future. Each graph repre provide implementations of the core operations and reuse many of the useful operations defined in [`GraphOps`][GraphOps]. +### Summary List of Operators +The following is a quick summary of the functionality defined in both [`Graph`][Graph] and +[`GraphOps`][GraphOps] but presented as members of Graph for simplicity. Note that some function +signatures have been simplified (e.g., default arguments and type constraints removed) and some more +advanced functionality has been removed so please consult the API docs for the official list of +operations. 
+ +{% highlight scala %} +/** Summary of the functionality in the property graph */ +class Graph[VD, ED] { + // Information about the Graph =================================================================== + val numEdges: Long + val numVertices: Long + val inDegrees: VertexRDD[Int] + val outDegrees: VertexRDD[Int] + val degrees: VertexRDD[Int] + // Views of the graph as collections ============================================================= + val vertices: VertexRDD[VD] + val edges: EdgeRDD[ED] + val triplets: RDD[EdgeTriplet[VD, ED]] + // Functions for caching graphs ================================================================== + def persist(newLevel: StorageLevel = StorageLevel.MEMORY_ONLY): Graph[VD, ED] + def cache(): Graph[VD, ED] + def unpersistVertices(blocking: Boolean = true): Graph[VD, ED] + // Change the partitioning heuristic ============================================================ + def partitionBy(partitionStrategy: PartitionStrategy): Graph[VD, ED] + // Transform vertex and edge attributes ========================================================== + def mapVertices[VD2](map: (VertexID, VD) => VD2): Graph[VD2, ED] + def mapEdges[ED2](map: Edge[ED] => ED2): Graph[VD, ED2] + def mapEdges[ED2](map: (PartitionID, Iterator[Edge[ED]]) => Iterator[ED2]): Graph[VD, ED2] + def mapTriplets[ED2](map: EdgeTriplet[VD, ED] => ED2): Graph[VD, ED2] + def mapTriplets[ED2](map: (PartitionID, Iterator[EdgeTriplet[VD, ED]]) => Iterator[ED2]) + : Graph[VD, ED2] + // Modify the graph structure ==================================================================== + def reverse: Graph[VD, ED] + def subgraph( + epred: EdgeTriplet[VD,ED] => Boolean = (x => true), + vpred: (VertexID, VD) => Boolean = ((v, d) => true)) + : Graph[VD, ED] + def mask[VD2, ED2](other: Graph[VD2, ED2]): Graph[VD, ED] + def groupEdges(merge: (ED, ED) => ED): Graph[VD, ED] + // Join RDDs with the graph ====================================================================== + def joinVertices[U](table: RDD[(VertexID, U)])(mapFunc: (VertexID, VD, U) => VD): Graph[VD, ED] + def outerJoinVertices[U, VD2](other: RDD[(VertexID, U)]) + (mapFunc: (VertexID, VD, Option[U]) => VD2) + : Graph[VD2, ED] + // Aggregate information about adjacent triplets ================================================= + def collectNeighborIds(edgeDirection: EdgeDirection): VertexRDD[Array[VertexID]] + def collectNeighbors(edgeDirection: EdgeDirection): VertexRDD[Array[(VertexID, VD)]] + def mapReduceTriplets[A: ClassTag]( + mapFunc: EdgeTriplet[VD, ED] => Iterator[(VertexID, A)], + reduceFunc: (A, A) => A, + activeSetOpt: Option[(VertexRDD[_], EdgeDirection)] = None) + : VertexRDD[A] + // Iterative graph-parallel computation ========================================================== + def pregel[A](initialMsg: A, maxIterations: Int, activeDirection: EdgeDirection)( + vprog: (VertexID, VD, A) => VD, + sendMsg: EdgeTriplet[VD, ED] => Iterator[(VertexID,A)], + mergeMsg: (A, A) => A) + : Graph[VD, ED] + // Basic graph algorithms ======================================================================== + def pageRank(tol: Double, resetProb: Double = 0.15): Graph[Double, Double] + def connectedComponents(): Graph[VertexID, ED] + def triangleCount(): Graph[Int, ED] + def stronglyConnectedComponents(numIter: Int): Graph[VertexID, ED] +} +{% endhighlight %} + + ## Property Operators In direct analogy to the RDD `map` operator, the property @@ -273,7 +360,7 @@ graph contains the following: {% highlight scala %} class Graph[VD, ED] { - def 
mapVertices[VD2](map: (VertexID, VD) => VD2): Graph[VD2, ED] + def mapVertices[VD2](map: (VertexId, VD) => VD2): Graph[VD2, ED] def mapEdges[ED2](map: Edge[ED] => ED2): Graph[VD, ED2] def mapTriplets[ED2](map: EdgeTriplet[VD, ED] => ED2): Graph[VD, ED2] } @@ -295,7 +382,7 @@ val newGraph = Graph(newVertices, graph.edges) val newGraph = graph.mapVertices((id, attr) => mapUdf(id, attr)) {% endhighlight %} -[Graph.mapVertices]: api/graphx/index.html#org.apache.spark.graphx.Graph@mapVertices[VD2]((VertexID,VD)āVD2)(ClassTag[VD2]):Graph[VD2,ED] +[Graph.mapVertices]: api/graphx/index.html#org.apache.spark.graphx.Graph@mapVertices[VD2]((VertexId,VD)āVD2)(ClassTag[VD2]):Graph[VD2,ED] These operators are often used to initialize the graph for a particular computation or project away unnecessary properties. For example, given a graph with the out-degrees as the vertex properties @@ -321,7 +408,7 @@ add more in the future. The following is a list of the basic structural operato class Graph[VD, ED] { def reverse: Graph[VD, ED] def subgraph(epred: EdgeTriplet[VD,ED] => Boolean, - vpred: (VertexID, VD) => Boolean): Graph[VD, ED] + vpred: (VertexId, VD) => Boolean): Graph[VD, ED] def mask[VD2, ED2](other: Graph[VD2, ED2]): Graph[VD, ED] def groupEdges(merge: (ED, ED) => ED): Graph[VD,ED] } @@ -340,11 +427,11 @@ satisfy the edge predicate *and connect vertices that satisfy the vertex predica operator can be used in number of situations to restrict the graph to the vertices and edges of interest or eliminate broken links. For example in the following code we remove broken links: -[Graph.subgraph]: api/graphx/index.html#org.apache.spark.graphx.Graph@subgraph((EdgeTriplet[VD,ED])āBoolean,(VertexID,VD)āBoolean):Graph[VD,ED] +[Graph.subgraph]: api/graphx/index.html#org.apache.spark.graphx.Graph@subgraph((EdgeTriplet[VD,ED])āBoolean,(VertexId,VD)āBoolean):Graph[VD,ED] {% highlight scala %} // Create an RDD for the vertices -val users: RDD[(VertexID, (String, String))] = +val users: RDD[(VertexId, (String, String))] = sc.parallelize(Array((3L, ("rxin", "student")), (7L, ("jgonzal", "postdoc")), (5L, ("franklin", "prof")), (2L, ("istoica", "prof")), (4L, ("peter", "student")))) @@ -407,9 +494,9 @@ using the *join* operators. Below we list the key join operators: {% highlight scala %} class Graph[VD, ED] { - def joinVertices[U](table: RDD[(VertexID, U)])(map: (VertexID, VD, U) => VD) + def joinVertices[U](table: RDD[(VertexId, U)])(map: (VertexId, VD, U) => VD) : Graph[VD, ED] - def outerJoinVertices[U, VD2](table: RDD[(VertexID, U)])(map: (VertexID, VD, Option[U]) => VD2) + def outerJoinVertices[U, VD2](table: RDD[(VertexId, U)])(map: (VertexId, VD, Option[U]) => VD2) : Graph[VD2, ED] } {% endhighlight %} @@ -419,13 +506,13 @@ returns a new graph with the vertex properties obtained by applying the user def to the result of the joined vertices. Vertices without a matching value in the RDD retain their original value. -[GraphOps.joinVertices]: api/graphx/index.html#org.apache.spark.graphx.GraphOps@joinVertices[U](RDD[(VertexID,U)])((VertexID,VD,U)āVD)(ClassTag[U]):Graph[VD,ED] +[GraphOps.joinVertices]: api/graphx/index.html#org.apache.spark.graphx.GraphOps@joinVertices[U](RDD[(VertexId,U)])((VertexId,VD,U)āVD)(ClassTag[U]):Graph[VD,ED] > Note that if the RDD contains more than one value for a given vertex only one will be used. It > is therefore recommended that the input RDD be first made unique using the following which will > also *pre-index* the resulting values to substantially accelerate the subsequent join. 
> {% highlight scala %} -val nonUniqueCosts: RDD[(VertexId, Double)] +val nonUniqueCosts: RDD[(VertexID, Double)] val uniqueCosts: VertexRDD[Double] = graph.vertices.aggregateUsingIndex(nonUnique, (a,b) => a + b) val joinedGraph = graph.joinVertices(uniqueCosts)( @@ -438,7 +525,7 @@ property type. Because not all vertices may have a matching value in the input function takes an `Option` type. For example, we can setup a graph for PageRank by initializing vertex properties with their `outDegree`. -[Graph.outerJoinVertices]: api/graphx/index.html#org.apache.spark.graphx.Graph@outerJoinVertices[U,VD2](RDD[(VertexID,U)])((VertexID,VD,Option[U])āVD2)(ClassTag[U],ClassTag[VD2]):Graph[VD2,ED] +[Graph.outerJoinVertices]: api/graphx/index.html#org.apache.spark.graphx.Graph@outerJoinVertices[U,VD2](RDD[(VertexId,U)])((VertexId,VD,Option[U])āVD2)(ClassTag[U],ClassTag[VD2]):Graph[VD2,ED] {% highlight scala %} @@ -457,7 +544,7 @@ val degreeGraph = graph.outerJoinVertices(outDegrees) { (id, oldAttr, outDegOpt) > provide type annotation for the user defined function: > {% highlight scala %} val joinedGraph = graph.joinVertices(uniqueCosts, - (id: VertexId, oldCost: Double, extraCost: Double) => oldCost + extraCost) + (id: VertexID, oldCost: Double, extraCost: Double) => oldCost + extraCost) {% endhighlight %} @@ -472,7 +559,7 @@ PageRank Value, shortest path to the source, and smallest reachable vertex id). ### Map Reduce Triplets (mapReduceTriplets) -[Graph.mapReduceTriplets]: api/graphx/index.html#org.apache.spark.graphx.Graph@mapReduceTriplets[A](mapFunc:org.apache.spark.graphx.EdgeTriplet[VD,ED]=>Iterator[(org.apache.spark.graphx.VertexID,A)],reduceFunc:(A,A)=>A,activeSetOpt:Option[(org.apache.spark.graphx.VertexRDD[_],org.apache.spark.graphx.EdgeDirection)])(implicitevidence$10:scala.reflect.ClassTag[A]):org.apache.spark.graphx.VertexRDD[A] +[Graph.mapReduceTriplets]: api/graphx/index.html#org.apache.spark.graphx.Graph@mapReduceTriplets[A](mapFunc:org.apache.spark.graphx.EdgeTriplet[VD,ED]=>Iterator[(org.apache.spark.graphx.VertexId,A)],reduceFunc:(A,A)=>A,activeSetOpt:Option[(org.apache.spark.graphx.VertexRDD[_],org.apache.spark.graphx.EdgeDirection)])(implicitevidence$10:scala.reflect.ClassTag[A]):org.apache.spark.graphx.VertexRDD[A] The core (heavily optimized) aggregation primitive in GraphX is the [`mapReduceTriplets`][Graph.mapReduceTriplets] operator: @@ -480,7 +567,7 @@ The core (heavily optimized) aggregation primitive in GraphX is the {% highlight scala %} class Graph[VD, ED] { def mapReduceTriplets[A]( - map: EdgeTriplet[VD, ED] => Iterator[(VertexID, A)], + map: EdgeTriplet[VD, ED] => Iterator[(VertexId, A)], reduce: (A, A) => A) : VertexRDD[A] } @@ -495,26 +582,26 @@ containing the aggregate message (of type `A`) destined to each vertex. Vertice receive a message are not included in the returned `VertexRDD`.
-In the following example we use the `mapReduceTriplets` operator to compute the average age of the @@ -547,8 +634,8 @@ val avgAgeOfOlderFollowers: VertexRDD[Double] = avgAgeOfOlderFollowers.collect.foreach(println(_)) {% endhighlight %} -> Note that the `mapReduceTriplets` operation performs optimally when the messages (and their sums) -> are constant sized (e.g., floats and addition instead of lists and concatenation). More +> Note that the `mapReduceTriplets` operation performs optimally when the messages (and the sums of +> messages) are constant sized (e.g., floats and addition instead of lists and concatenation). More > precisely, the result of `mapReduceTriplets` should ideally be sub-linear in the degree of each > vertex. @@ -562,13 +649,13 @@ compute the max in, out, and total degrees: {% highlight scala %} // Define a reduce operation to compute the highest degree vertex -def max(a: (VertexID, Int), b: (VertexID, Int)): (VertexID, Int) = { +def max(a: (VertexId, Int), b: (VertexId, Int)): (VertexId, Int) = { if (a._2 > b._2) a else b } // Compute the max degrees -val maxInDegree: (VertexID, Int) = graph.inDegrees.reduce(max) -val maxOutDegree: (VertexID, Int) = graph.outDegrees.reduce(max) -val maxDegrees: (VertexID, Int) = graph.degrees.reduce(max) +val maxInDegree: (VertexId, Int) = graph.inDegrees.reduce(max) +val maxOutDegree: (VertexId, Int) = graph.outDegrees.reduce(max) +val maxDegrees: (VertexId, Int) = graph.degrees.reduce(max) {% endhighlight %} ### Collecting Neighbors @@ -578,14 +665,14 @@ attributes at each vertex. This can be easily accomplished using the [`collectNeighborIds`][GraphOps.collectNeighborIds] and the [`collectNeighbors`][GraphOps.collectNeighbors] operators. -[GraphOps.collectNeighborIds]: api/graphx/index.html#org.apache.spark.graphx.GraphOps@collectNeighborIds(EdgeDirection):VertexRDD[Array[VertexID]] -[GraphOps.collectNeighbors]: api/graphx/index.html#org.apache.spark.graphx.GraphOps@collectNeighbors(EdgeDirection):VertexRDD[Array[(VertexID,VD)]] +[GraphOps.collectNeighborIds]: api/graphx/index.html#org.apache.spark.graphx.GraphOps@collectNeighborIds(EdgeDirection):VertexRDD[Array[VertexId]] +[GraphOps.collectNeighbors]: api/graphx/index.html#org.apache.spark.graphx.GraphOps@collectNeighbors(EdgeDirection):VertexRDD[Array[(VertexId,VD)]] {% highlight scala %} class GraphOps[VD, ED] { - def collectNeighborIds(edgeDirection: EdgeDirection): VertexRDD[Array[VertexID]] - def collectNeighbors(edgeDirection: EdgeDirection): VertexRDD[ Array[(VertexID, VD)] ] + def collectNeighborIds(edgeDirection: EdgeDirection): VertexRDD[Array[VertexId]] + def collectNeighbors(edgeDirection: EdgeDirection): VertexRDD[ Array[(VertexId, VD)] ] } {% endhighlight %} @@ -593,11 +680,20 @@ class GraphOps[VD, ED] { > substantial communication. If possible try expressing the same computation using the > `mapReduceTriplets` operator directly. +## Caching and Uncaching + +In Spark, RDDs are not persisted in memory by default. To avoid recomputation, they must be explicitly cached when using them multiple times (see the [Spark Programming Guide][RDD Persistence]). Graphs in GraphX behave the same way. **When using a graph multiple times, make sure to call [`Graph.cache()`][Graph.cache] on it first.** + +[RDD Persistence]: scala-programming-guide.html#rdd-persistence +[Graph.cache]: api/graphx/index.html#org.apache.spark.graphx.Graph@cache():Graph[VD,ED] + +In iterative computations, *uncaching* may also be necessary for best performance. 
By default, cached RDDs and graphs will remain in memory until memory pressure forces them to be evicted in LRU order. For iterative computation, intermediate results from previous iterations will fill up the cache. Though they will eventually be evicted, the unnecessary data stored in memory will slow down garbage collection. It would be more efficient to uncache intermediate results as soon as they are no longer necessary. This involves materializing (caching and forcing) a graph or RDD every iteration, uncaching all other datasets, and only using the materialized dataset in future iterations. However, because graphs are composed of multiple RDDs, it can be difficult to unpersist them correctly. **For iterative computation we recommend using the Pregel API, which correctly unpersists intermediate results.** + # Pregel API Graphs are inherently recursive data-structures as properties of vertices depend on properties of -their neighbors which intern depend on properties of *their* neighbors. As a +their neighbors which in turn depend on properties of *their* neighbors. As a consequence many important graph algorithms iteratively recompute the properties of each vertex until a fixed-point condition is reached. A range of graph-parallel abstractions have been proposed to express these iterative algorithms. GraphX exposes a Pregel-like operator which is a fusion of @@ -620,7 +716,7 @@ messages remaining. The following is the type signature of the [Pregel operator][GraphOps.pregel] as well as a *sketch* of its implementation (note calls to graph.cache have been removed): -[GraphOps.pregel]: api/graphx/index.html#org.apache.spark.graphx.GraphOps@pregel[A](A,Int,EdgeDirection)((VertexID,VD,A)āVD,(EdgeTriplet[VD,ED])āIterator[(VertexID,A)],(A,A)āA)(ClassTag[A]):Graph[VD,ED] +[GraphOps.pregel]: api/graphx/index.html#org.apache.spark.graphx.GraphOps@pregel[A](A,Int,EdgeDirection)((VertexId,VD,A)āVD,(EdgeTriplet[VD,ED])āIterator[(VertexId,A)],(A,A)āA)(ClassTag[A]):Graph[VD,ED] {% highlight scala %} class GraphOps[VD, ED] { @@ -628,8 +724,8 @@ class GraphOps[VD, ED] { (initialMsg: A, maxIter: Int = Int.MaxValue, activeDir: EdgeDirection = EdgeDirection.Out) - (vprog: (VertexID, VD, A) => VD, - sendMsg: EdgeTriplet[VD, ED] => Iterator[(VertexID, A)], + (vprog: (VertexId, VD, A) => VD, + sendMsg: EdgeTriplet[VD, ED] => Iterator[(VertexId, A)], mergeMsg: (A, A) => A) : Graph[VD, ED] = { // Receive the initial message at each vertex @@ -674,7 +770,7 @@ import org.apache.spark.graphx.util.GraphGenerators // A graph with edge attributes containing distances val graph: Graph[Int, Double] = GraphGenerators.logNormalGraph(sc, numVertices = 100).mapEdges(e => e.attr.toDouble) -val sourceId: VertexID = 42 // The ultimate source +val sourceId: VertexId = 42 // The ultimate source // Initialize the graph such that all vertices except the root have distance infinity. 
val initialGraph = graph.mapVertices((id, _) => if (id == sourceId) 0.0 else Double.PositiveInfinity) val sssp = initialGraph.pregel(Double.PositiveInfinity)( @@ -721,7 +817,7 @@ It creates a `Graph` from the specified edges, automatically creating any vertic {% highlight scala %} object Graph { def apply[VD, ED]( - vertices: RDD[(VertexID, VD)], + vertices: RDD[(VertexId, VD)], edges: RDD[Edge[ED]], defaultVertexAttr: VD = null) : Graph[VD, ED] @@ -731,7 +827,7 @@ object Graph { defaultValue: VD): Graph[VD, ED] def fromEdgeTuples[VD]( - rawEdges: RDD[(VertexID, VertexID)], + rawEdges: RDD[(VertexId, VertexId)], defaultValue: VD, uniqueEdges: Option[PartitionStrategy] = None): Graph[VD, Int] @@ -747,8 +843,8 @@ object Graph { [PartitionStrategy]: api/graphx/index.html#org.apache.spark.graphx.PartitionStrategy$ [GraphLoader.edgeListFile]: api/graphx/index.html#org.apache.spark.graphx.GraphLoader$@edgeListFile(SparkContext,String,Boolean,Int):Graph[Int,Int] -[Graph.apply]: api/graphx/index.html#org.apache.spark.graphx.Graph$@apply[VD,ED](RDD[(VertexID,VD)],RDD[Edge[ED]],VD)(ClassTag[VD],ClassTag[ED]):Graph[VD,ED] -[Graph.fromEdgeTuples]: api/graphx/index.html#org.apache.spark.graphx.Graph$@fromEdgeTuples[VD](RDD[(VertexID,VertexID)],VD,Option[PartitionStrategy])(ClassTag[VD]):Graph[VD,Int] +[Graph.apply]: api/graphx/index.html#org.apache.spark.graphx.Graph$@apply[VD,ED](RDD[(VertexId,VD)],RDD[Edge[ED]],VD)(ClassTag[VD],ClassTag[ED]):Graph[VD,ED] +[Graph.fromEdgeTuples]: api/graphx/index.html#org.apache.spark.graphx.Graph$@fromEdgeTuples[VD](RDD[(VertexId,VertexId)],VD,Option[PartitionStrategy])(ClassTag[VD]):Graph[VD,Int] [Graph.fromEdges]: api/graphx/index.html#org.apache.spark.graphx.Graph$@fromEdges[VD,ED](RDD[Edge[ED]],VD)(ClassTag[VD],ClassTag[ED]):Graph[VD,ED] # Vertex and Edge RDDs @@ -761,47 +857,46 @@ respectively. In this section we review some of the additional useful functiona ## VertexRDDs -The `VertexRDD[A]` extends the more traditional `RDD[(VertexId, A)]` but adds the additional -constraint that each `VertexId` occurs only *once*. Moreover, `VertexRDD[A]` represents a *set* of -vertices each with an attribute of type `A`. Internally, this is achieved by storing the vertex -attributes in a reusable hash-map data-structure. As a consequence if two `VertexRDD`s are derived -from the same base `VertexRDD` (e.g., by `filter` or `mapValues`) they can be joined in constant -time without hash evaluations. To leverage this indexed data-structure, the `VertexRDD` exposes the -following additional functionality: +The `VertexRDD[A]` extends `RDD[(VertexID, A)]` and adds the additional constraint that each +`VertexID` occurs only *once*. Moreover, `VertexRDD[A]` represents a *set* of vertices each with an +attribute of type `A`. Internally, this is achieved by storing the vertex attributes in a reusable +hash-map data-structure. As a consequence if two `VertexRDD`s are derived from the same base +`VertexRDD` (e.g., by `filter` or `mapValues`) they can be joined in constant time without hash +evaluations. 
To leverage this indexed data-structure, the `VertexRDD` exposes the following +additional functionality: {% highlight scala %} -class VertexRDD[VD] { +class VertexRDD[VD] extends RDD[(VertexID, VD)] { // Filter the vertex set but preserves the internal index - def filter(pred: Tuple2[VertexID, VD] => Boolean): VertexRDD[VD] + def filter(pred: Tuple2[VertexId, VD] => Boolean): VertexRDD[VD] // Transform the values without changing the ids (preserves the internal index) def mapValues[VD2](map: VD => VD2): VertexRDD[VD2] - def mapValues[VD2](map: (VertexID, VD) => VD2): VertexRDD[VD2] + def mapValues[VD2](map: (VertexId, VD) => VD2): VertexRDD[VD2] // Remove vertices from this set that appear in the other set def diff(other: VertexRDD[VD]): VertexRDD[VD] // Join operators that take advantage of the internal indexing to accelerate joins (substantially) - def leftJoin[VD2, VD3](other: RDD[(VertexID, VD2)])(f: (VertexID, VD, Option[VD2]) => VD3): VertexRDD[VD3] - def innerJoin[U, VD2](other: RDD[(VertexID, U)])(f: (VertexID, VD, U) => VD2): VertexRDD[VD2] + def leftJoin[VD2, VD3](other: RDD[(VertexId, VD2)])(f: (VertexId, VD, Option[VD2]) => VD3): VertexRDD[VD3] + def innerJoin[U, VD2](other: RDD[(VertexId, U)])(f: (VertexId, VD, U) => VD2): VertexRDD[VD2] // Use the index on this RDD to accelerate a `reduceByKey` operation on the input RDD. - def aggregateUsingIndex[VD2](other: RDD[(VertexID, VD2)], reduceFunc: (VD2, VD2) => VD2): VertexRDD[VD2] + def aggregateUsingIndex[VD2](other: RDD[(VertexId, VD2)], reduceFunc: (VD2, VD2) => VD2): VertexRDD[VD2] } {% endhighlight %} Notice, for example, how the `filter` operator returns an `VertexRDD`. Filter is actually implemented using a `BitSet` thereby reusing the index and preserving the ability to do fast joins with other `VertexRDD`s. Likewise, the `mapValues` operators do not allow the `map` function to -change the `VertexId` thereby enabling the same `HashMap` data-structures to be reused. Both the +change the `VertexID` thereby enabling the same `HashMap` data-structures to be reused. Both the `leftJoin` and `innerJoin` are able to identify when joining two `VertexRDD`s derived from the same `HashMap` and implement the join by linear scan rather than costly point lookups. -The `aggregateUsingIndex` operator can be slightly confusing but is also useful for efficient -construction of a new `VertexRDD` from an `RDD[(VertexId, A)]`. Conceptually, if I have constructed -a `VertexRDD[B]` over a set of vertices, *which is a super-set* of the vertices in some -`RDD[(VertexId, A)]` then I can reuse the index to both aggregate and then subsequently index the -RDD. For example: +The `aggregateUsingIndex` operator is useful for efficient construction of a new `VertexRDD` from an +`RDD[(VertexID, A)]`. Conceptually, if I have constructed a `VertexRDD[B]` over a set of vertices, +*which is a super-set* of the vertices in some `RDD[(VertexID, A)]` then I can reuse the index to +both aggregate and then subsequently index the `RDD[(VertexID, A)]`. 
For example: {% highlight scala %} val setA: VertexRDD[Int] = VertexRDD(sc.parallelize(0L until 100L).map(id => (id, 1))) -val rddB: RDD[(VertexID, Double)] = sc.parallelize(0L until 100L).flatMap(id => List((id, 1.0), (id, 2.0))) +val rddB: RDD[(VertexId, Double)] = sc.parallelize(0L until 100L).flatMap(id => List((id, 1.0), (id, 2.0))) // There should be 200 entries in rddB rddB.count val setB: VertexRDD[Double] = setA.aggregateUsingIndex(rddB, _ + _) @@ -813,10 +908,10 @@ val setC: VertexRDD[Double] = setA.innerJoin(setB)((id, a, b) => a + b) ## EdgeRDDs -The `EdgeRDD[ED]`, which extends `RDD[Edge[ED]]` is considerably simpler than the `VertexRDD`. -GraphX organizes the edges in blocks partitioned using one of the various partitioning strategies -defined in [`PartitionStrategy`][PartitionStrategy]. Within each partition, edge attributes and -adjacency structure, are stored separately enabling maximum reuse when changing attribute values. +The `EdgeRDD[ED]`, which extends `RDD[Edge[ED]]` organizes the edges in blocks partitioned using one +of the various partitioning strategies defined in [`PartitionStrategy`][PartitionStrategy]. Within +each partition, edge attributes and adjacency structure, are stored separately enabling maximum +reuse when changing attribute values. [PartitionStrategy]: api/graphx/index.html#org.apache.spark.graphx.PartitionStrategy @@ -827,11 +922,11 @@ def mapValues[ED2](f: Edge[ED] => ED2): EdgeRDD[ED2] // Revere the edges reusing both attributes and structure def reverse: EdgeRDD[ED] // Join two `EdgeRDD`s partitioned using the same partitioning strategy. -def innerJoin[ED2, ED3](other: EdgeRDD[ED2])(f: (VertexID, VertexID, ED, ED2) => ED3): EdgeRDD[ED3] +def innerJoin[ED2, ED3](other: EdgeRDD[ED2])(f: (VertexId, VertexId, ED, ED2) => ED3): EdgeRDD[ED3] {% endhighlight %} In most applications we have found that operations on the `EdgeRDD` are accomplished through the -graph or rely on operations defined in the base `RDD` class. +graph operators or rely on operations defined in the base `RDD` class. # Optimized Representation @@ -853,7 +948,9 @@ reduce both the communication and storage overhead. Logically, this corresponds to machines and allowing vertices to span multiple machines. The exact method of assigning edges depends on the [`PartitionStrategy`][PartitionStrategy] and there are several tradeoffs to the various heuristics. Users can choose between different strategies by repartitioning the graph with -the [`Graph.partitionBy`][Graph.partitionBy] operator. +the [`Graph.partitionBy`][Graph.partitionBy] operator. The default partitioning strategy is to use +the initial partitioning of the edges as provided on graph construction. However, users can easily +switch to 2D-partitioning or other heuristics included in GraphX. [Graph.partitionBy]: api/graphx/index.html#org.apache.spark.graphx.Graph$@partitionBy(partitionStrategy:org.apache.spark.graphx.PartitionStrategy):org.apache.spark.graphx.Graph[VD,ED] @@ -867,16 +964,15 @@ the [`Graph.partitionBy`][Graph.partitionBy] operator. Once the edges have be partitioned the key challenge to efficient graph-parallel computation is efficiently joining vertex attributes with the edges. Because real-world graphs typically have more -edges than vertices, we move vertex attributes to the edges. - - - - +edges than vertices, we move vertex attributes to the edges. 
Because not all partitions will +contain edges adjacent to all vertices we internally maintain a routing table which identifies where +to broadcast vertices when implementing the join required for operations like `triplets` and +`mapReduceTriplets`. # Graph Algorithms -GraphX includes a set of graph algorithms in to simplify analytics. The algorithms are contained in the `org.apache.spark.graphx.lib` package and can be accessed directly as methods on `Graph` via [`GraphOps`][GraphOps]. This section describes the algorithms and how they are used. +GraphX includes a set of graph algorithms to simplify analytics tasks. The algorithms are contained in the `org.apache.spark.graphx.lib` package and can be accessed directly as methods on `Graph` via [`GraphOps`][GraphOps]. This section describes the algorithms and how they are used. ## PageRank @@ -953,13 +1049,6 @@ val triCountByUsername = users.join(triCounts).map { case (id, (username, tc)) = println(triCountByUsername.collect().mkString("\n")) {% endhighlight %} --Note that
-`mapReduceTriplets` takes an additional optional `activeSet` (see API docs) which restricts
-the map phase to edges adjacent to the vertices in the provided `VertexRDD`:
+Note that `mapReduceTriplets` takes an additional optional `activeSet`
+(not shown above, see API docs for details) which restricts the map phase to edges adjacent to the
+vertices in the provided `VertexRDD`:

 {% highlight scala %}
   activeSetOpt: Option[(VertexRDD[_], EdgeDirection)] = None
 {% endhighlight %}

-The EdgeDirection specifies which edges adjacent to the vertex set are included in the map phase.
-If the direction is `In`, `mapFunc` will only be run on edges with destination in the active set.
-If the direction is `Out`, `mapFunc` will only be run on edges originating from vertices in the
-active set.  If the direction is `Either`, `mapFunc` will be run on edges with either vertex in
-the active set.  If the direction is `Both`, `mapFunc` will be run on edges with both vertices in
-the active set.  The active set must be derived from the set of vertices in the graph.
-Restricting computation to triplets adjacent to a subset of the vertices is often necessary in
-incremental iterative computation and is a key part of the GraphX implementation of Pregel.
+The EdgeDirection specifies which edges adjacent to the vertex set are included in the map
+phase. If the direction is `In`, then the user defined `map` function will only be run on edges
+with the destination vertex in the active set. If the direction is `Out`, then the `map` function
+will only be run on edges originating from vertices in the active set. If the direction is
+`Either`, then the `map` function will be run on edges with either vertex in the active set. If
+the direction is `Both`, then the `map` function will be run on edges with both vertices in the
+active set. The active set must be derived from the set of vertices in the graph. Restricting
+computation to triplets adjacent to a subset of the vertices is often necessary in incremental
+iterative computation and is a key part of the GraphX implementation of Pregel.
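As a concrete illustration of the `activeSetOpt` argument, here is a minimal sketch (not from the original guide; the attribute types, the threshold, and the choice of `EdgeDirection.In` are assumptions):

{% highlight scala %}
// Assume graph: Graph[Double, Double]. Mark vertices with attribute > 0.5 as "active".
val activeVertices: VertexRDD[Unit] =
  graph.vertices.filter { case (id, attr) => attr > 0.5 }.mapValues(v => ())

// Only edges whose destination vertex is in the active set are passed to the map function.
val msgs: VertexRDD[Double] = graph.mapReduceTriplets[Double](
  triplet => Iterator((triplet.dstId, triplet.srcAttr)),
  (a, b) => a + b,
  activeSetOpt = Some((activeVertices, EdgeDirection.In)))
{% endhighlight %}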
# Examples diff --git a/docs/img/java-sm.png b/docs/img/java-sm.png new file mode 100644 index 0000000000000..a82ee7d682e49 Binary files /dev/null and b/docs/img/java-sm.png differ diff --git a/docs/img/python-sm.png b/docs/img/python-sm.png new file mode 100644 index 0000000000000..ae01e05252abd Binary files /dev/null and b/docs/img/python-sm.png differ diff --git a/docs/img/scala-sm.png b/docs/img/scala-sm.png new file mode 100644 index 0000000000000..30db034b70cf9 Binary files /dev/null and b/docs/img/scala-sm.png differ diff --git a/docs/img/streaming-arch.png b/docs/img/streaming-arch.png new file mode 100644 index 0000000000000..bc57b460fdf8b Binary files /dev/null and b/docs/img/streaming-arch.png differ diff --git a/docs/img/streaming-dstream-ops.png b/docs/img/streaming-dstream-ops.png new file mode 100644 index 0000000000000..a1c5634aa3c3a Binary files /dev/null and b/docs/img/streaming-dstream-ops.png differ diff --git a/docs/img/streaming-dstream-window.png b/docs/img/streaming-dstream-window.png new file mode 100644 index 0000000000000..276d2fee5e30e Binary files /dev/null and b/docs/img/streaming-dstream-window.png differ diff --git a/docs/img/streaming-dstream.png b/docs/img/streaming-dstream.png new file mode 100644 index 0000000000000..90f43b8c7138c Binary files /dev/null and b/docs/img/streaming-dstream.png differ diff --git a/docs/img/streaming-figures.pptx b/docs/img/streaming-figures.pptx new file mode 100644 index 0000000000000..1b18c2ee0ea3e Binary files /dev/null and b/docs/img/streaming-figures.pptx differ diff --git a/docs/img/streaming-flow.png b/docs/img/streaming-flow.png new file mode 100644 index 0000000000000..a870cb9b1839b Binary files /dev/null and b/docs/img/streaming-flow.png differ diff --git a/docs/index.md b/docs/index.md index debdb33108676..4eb297df39144 100644 --- a/docs/index.md +++ b/docs/index.md @@ -9,7 +9,7 @@ It also supports a rich set of higher-level tools including [Shark](http://shark # Downloading -Get Spark by visiting the [downloads page](http://spark.incubator.apache.org/downloads.html) of the Apache Spark site. This documentation is for Spark version {{site.SPARK_VERSION}}. +Get Spark by visiting the [downloads page](http://spark.apache.org/downloads.html) of the Apache Spark site. This documentation is for Spark version {{site.SPARK_VERSION}}. Spark runs on both Windows and UNIX-like systems (e.g. Linux, Mac OS). All you need to run it is to have `java` to installed on your system `PATH`, or the `JAVA_HOME` environment variable pointing to a Java installation. @@ -19,7 +19,7 @@ Spark uses [Simple Build Tool](http://www.scala-sbt.org), which is bundled with sbt/sbt assembly -For its Scala API, Spark {{site.SPARK_VERSION}} depends on Scala {{site.SCALA_VERSION}}. If you write applications in Scala, you will need to use this same version of Scala in your own program -- newer major versions may not work. You can get the right version of Scala from [scala-lang.org](http://www.scala-lang.org/download/). +For its Scala API, Spark {{site.SPARK_VERSION}} depends on Scala {{site.SCALA_BINARY_VERSION}}. If you write applications in Scala, you will need to use a compatible Scala version (e.g. {{site.SCALA_BINARY_VERSION}}.X) -- newer major versions may not work. You can get the right version of Scala from [scala-lang.org](http://www.scala-lang.org/download/). 
# Running the Examples and Shell @@ -75,7 +75,7 @@ For this version of Spark (0.8.1) Hadoop 2.2.x (or newer) users will have to bui * [Spark Programming Guide](scala-programming-guide.html): an overview of Spark concepts, and details on the Scala API * [Java Programming Guide](java-programming-guide.html): using Spark from Java * [Python Programming Guide](python-programming-guide.html): using Spark from Python -* [Spark Streaming](streaming-programming-guide.html): using the alpha release of Spark Streaming +* [Spark Streaming](streaming-programming-guide.html): Spark's API for processing data streams * [MLlib (Machine Learning)](mllib-guide.html): Spark's built-in machine learning library * [Bagel (Pregel on Spark)](bagel-programming-guide.html): simple graph processing model * [GraphX (Graphs on Spark)](graphx-programming-guide.html): Spark's new API for graphs @@ -96,7 +96,7 @@ For this version of Spark (0.8.1) Hadoop 2.2.x (or newer) users will have to bui * [Amazon EC2](ec2-scripts.html): scripts that let you launch a cluster on EC2 in about 5 minutes * [Standalone Deploy Mode](spark-standalone.html): launch a standalone cluster quickly without a third-party cluster manager * [Mesos](running-on-mesos.html): deploy a private cluster using - [Apache Mesos](http://incubator.apache.org/mesos) + [Apache Mesos](http://mesos.apache.org) * [YARN](running-on-yarn.html): deploy Spark on top of Hadoop NextGen (YARN) **Other documents:** @@ -110,20 +110,20 @@ For this version of Spark (0.8.1) Hadoop 2.2.x (or newer) users will have to bui **External resources:** -* [Spark Homepage](http://spark.incubator.apache.org) +* [Spark Homepage](http://spark.apache.org) * [Shark](http://shark.cs.berkeley.edu): Apache Hive over Spark -* [Mailing Lists](http://spark.incubator.apache.org/mailing-lists.html): ask questions about Spark here +* [Mailing Lists](http://spark.apache.org/mailing-lists.html): ask questions about Spark here * [AMP Camps](http://ampcamp.berkeley.edu/): a series of training camps at UC Berkeley that featured talks and exercises about Spark, Shark, Mesos, and more. [Videos](http://ampcamp.berkeley.edu/agenda-2012), [slides](http://ampcamp.berkeley.edu/agenda-2012) and [exercises](http://ampcamp.berkeley.edu/exercises-2012) are available online for free. -* [Code Examples](http://spark.incubator.apache.org/examples.html): more are also available in the [examples subfolder](https://github.com/apache/incubator-spark/tree/master/examples/src/main/scala/) of Spark +* [Code Examples](http://spark.apache.org/examples.html): more are also available in the [examples subfolder](https://github.com/apache/spark/tree/master/examples/src/main/scala/) of Spark * [Paper Describing Spark](http://www.cs.berkeley.edu/~matei/papers/2012/nsdi_spark.pdf) * [Paper Describing Spark Streaming](http://www.eecs.berkeley.edu/Pubs/TechRpts/2012/EECS-2012-259.pdf) # Community -To get help using Spark or keep up with Spark development, sign up for the [user mailing list](http://spark.incubator.apache.org/mailing-lists.html). +To get help using Spark or keep up with Spark development, sign up for the [user mailing list](http://spark.apache.org/mailing-lists.html). If you're in the San Francisco Bay Area, there's a regular [Spark meetup](http://www.meetup.com/spark-users/) every few weeks. Come by to meet the developers and other users. 
diff --git a/docs/java-programming-guide.md b/docs/java-programming-guide.md index 07732fa1229f3..5c73dbb25ede8 100644 --- a/docs/java-programming-guide.md +++ b/docs/java-programming-guide.md @@ -189,7 +189,7 @@ We hope to generate documentation with Java-style syntax in the future. # Where to Go from Here Spark includes several sample programs using the Java API in -[`examples/src/main/java`](https://github.com/apache/incubator-spark/tree/master/examples/src/main/java/org/apache/spark/examples). You can run them by passing the class name to the +[`examples/src/main/java`](https://github.com/apache/spark/tree/master/examples/src/main/java/org/apache/spark/examples). You can run them by passing the class name to the `bin/run-example` script included in Spark; for example: ./bin/run-example org.apache.spark.examples.JavaWordCount diff --git a/docs/js/main.js b/docs/js/main.js index 8b137891791fe..0bd2286cced19 100755 --- a/docs/js/main.js +++ b/docs/js/main.js @@ -1 +1,80 @@ +function codeTabs() { + var counter = 0; + var langImages = { + "scala": "img/scala-sm.png", + "python": "img/python-sm.png", + "java": "img/java-sm.png" + }; + $("div.codetabs").each(function() { + $(this).addClass("tab-content"); + // Insert the tab bar + var tabBar = $(' '); + $(this).before(tabBar); + + // Add each code sample to the tab bar: + var codeSamples = $(this).children("div"); + codeSamples.each(function() { + $(this).addClass("tab-pane"); + var lang = $(this).data("lang"); + var image = $(this).data("image"); + var notabs = $(this).data("notabs"); + var capitalizedLang = lang.substr(0, 1).toUpperCase() + lang.substr(1); + var id = "tab_" + lang + "_" + counter; + $(this).attr("id", id); + if (image != null && langImages[lang]) { + var buttonLabel = ""; + } else if (notabs == null) { + var buttonLabel = "" + capitalizedLang + ""; + } else { + var buttonLabel = "" + } + tabBar.append( + '+ +
+ +Internally, it works as follows. Spark Streaming receives live input data streams and divides +the data into batches, which are then processed by the Spark engine to generate the final +stream of results in batches. + ++ +
+ +Spark Streaming provides a high-level abstraction called *discretized stream* or *DStream*, +which represents a continuous stream of data. DStreams can be created either from input data +stream from sources such as Kafka and Flume, or by applying high-level +operations on other DStreams. Internally, a DStream is represented as a sequence of +[RDDs](api/core/index.html#org.apache.spark.rdd.RDD). + +This guide shows you how to start writing Spark Streaming programs with DStreams. You can +write Spark Streaming programs in Scala or Java, both of which are presented in this guide. You +will find tabs throughout this guide that let you choose between Scala and Java +code snippets. + +*************************************************************************************************** + +# A Quick Example +Before we go into the details of how to write your own Spark Streaming program, +let's take a quick look at what a simple Spark Streaming program looks like. Let's say we want to +count the number of words in text data received from a data server listening on a TCP +socket. All you need to +do is as follows. + ++{% highlight bash %} +# TERMINAL 1: +# Running Netcat + +$ nc -lk 9999 + +hello world + + + +... +{% endhighlight %} + | ++ | +{% highlight bash %} +# TERMINAL 2: RUNNING NetworkWordCount or JavaNetworkWordCount + +$ ./bin/run-example org.apache.spark.streaming.examples.NetworkWordCount local[2] localhost 9999 +... +------------------------------------------- +Time: 1357008430000 ms +------------------------------------------- +(hello,1) +(world,1) +... +{% endhighlight %} + | +
Source | Artifact |
---|---|
Kafka | spark-streaming-kafka_{{site.SCALA_BINARY_VERSION}} |
Flume | spark-streaming-flume_{{site.SCALA_BINARY_VERSION}} |
Twitter | spark-streaming-twitter_{{site.SCALA_BINARY_VERSION}} |
ZeroMQ | spark-streaming-zeromq_{{site.SCALA_BINARY_VERSION}} |
MQTT | spark-streaming-mqtt_{{site.SCALA_BINARY_VERSION}} |
+ +
+ +Any operation applied on a DStream translates to operations on the underlying RDDs. For example, +in the [earlier example](#a-quick-example) of converting a stream of lines to words, +the `flatmap` operation is applied on each RDD in the `lines` DStream to generate the RDDs of the + `words` DStream. This is shown the following figure. + ++ +
+ + +These underlying RDD transformations are computed by the Spark engine. The DStream operations +hide most of these details and provides the developer with higher-level API for convenience. +These operations are discussed in detail in later sections. + +## Input Sources + +We have already taken a look at the `streamingContext.socketTextStream(...)` in the [quick +example](#a-quick-example) which creates a DStream from text +data received over a TCP socket connection. Besides sockets, the core Spark Streaming API provides +methods for creating DStreams from files and Akka actors as input sources. + +Specifically, for files, the DStream can be created as + +Transformation | Meaning | ||
---|---|---|---|
Transformation | Meaning | ||
map(func) | -Returns a new DStream formed by passing each element of the source DStream through a function func. | -||
filter(func) | -Returns a new DStream formed by selecting those elements of the source DStream on which func returns true. | +Return a new DStream by passing each element of the source DStream through a + function func. | |
flatMap(func) | - Similar to map, but each input item can be mapped to 0 or more output items (so func should return a Seq rather than a single item). |
+ Similar to map, but each input item can be mapped to 0 or more output items. | |
mapPartitions(func) | -Similar to map, but runs separately on each partition (block) of the DStream, so func must be of type - Iterator[T] => Iterator[U] when running on an DStream of type T. | +filter(func) | +Return a new DStream by selecting only the records of the source DStream on which + func returns true. |
repartition(numPartitions) | @@ -82,329 +465,681 @@ DStreams support many of the transformations available on normal Spark RDD's:|||
union(otherStream) | -Return a new DStream that contains the union of the elements in the source DStream and the argument DStream. | +Return a new DStream that contains the union of the elements in the source DStream and + otherDStream. | |
count() | -Returns a new DStream of single-element RDDs by counting the number of elements in each RDD of the source DStream. | +Return a new DStream of single-element RDDs by counting the number of elements in each RDD + of the source DStream. | |
reduce(func) | -Returns a new DStream of single-element RDDs by aggregating the elements in each RDD of the source DStream using a function func (which takes two arguments and returns one). The function should be associative so that it can be computed in parallel. | +Return a new DStream of single-element RDDs by aggregating the elements in each RDD of the + source DStream using a function func (which takes two arguments and returns one). + The function should be associative so that it can be computed in parallel. | |
countByValue() | -When called on a DStream of elements of type K, returns a new DStream of (K, Long) pairs where the value of each key is its frequency in each RDD of the source DStream. | -||
groupByKey([numTasks]) | - When called on a DStream of (K, V) pairs, returns a new DStream of (K, Seq[V]) pairs by grouping together all the values of each key in the RDDs of the source DStream. - Note: By default, this uses Spark's default number of parallel tasks (2 for local machine, 8 for a cluster) to do the grouping. You can pass an optional numTasks argument to set a different number of tasks.
- |
+ When called on a DStream of elements of type K, return a new DStream of (K, Long) pairs + where the value of each key is its frequency in each RDD of the source DStream. | |
reduceByKey(func, [numTasks]) | - When called on a DStream of (K, V) pairs, returns a new DStream of (K, V) pairs where the values for each key are aggregated using the given reduce function. Like in groupByKey , the number of reduce tasks is configurable through an optional second argument. |
+ When called on a DStream of (K, V) pairs, return a new DStream of (K, V) pairs where the
+ values for each key are aggregated using the given reduce function. Note: By default,
+ this uses Spark's default number of parallel tasks (2 for local machine, 8 for a cluster) to
+ do the grouping. You can pass an optional numTasks argument to set a different
+ number of tasks. |
|
join(otherStream, [numTasks]) | -When called on two DStreams of (K, V) and (K, W) pairs, returns a new DStream of (K, (V, W)) pairs with all pairs of elements for each key. | +When called on two DStreams of (K, V) and (K, W) pairs, return a new DStream of (K, (V, W)) + pairs with all pairs of elements for each key. | |
cogroup(otherStream, [numTasks]) | -When called on DStream of (K, V) and (K, W) pairs, returns a new DStream of (K, Seq[V], Seq[W]) tuples. | +When called on DStream of (K, V) and (K, W) pairs, return a new DStream of + (K, Seq[V], Seq[W]) tuples. | |
transform(func) | -Returns a new DStream by applying func (a RDD-to-RDD function) to every RDD of the stream. This can be used to do arbitrary RDD operations on the DStream. | +Return a new DStream by applying a RDD-to-RDD function to every RDD of the source DStream. + This can be used to do arbitrary RDD operations on the DStream. | |
updateStateByKey(func) | -Return a new "state" DStream where the state for each key is updated by applying the given function on the previous state of the key and the new values of each key. This can be used to track session state by using the session-id as the key and updating the session state as new data is received. | -
Transformation | Meaning | |
---|---|---|
window(windowDuration, slideDuration) | -Return a new DStream which is computed based on windowed batches of the source DStream. windowDuration is the width of the window and slideTime is the frequency during which the window is calculated. Both times must be multiples of the batch interval. - | -|
countByWindow(windowDuration, slideDuration) | - Return a sliding count of elements in the stream. windowDuration and slideDuration are exactly as defined in window() .
- |
-|
reduceByWindow(func, windowDuration, slideDuration) | - Return a new single-element stream, created by aggregating elements in the stream over a sliding interval using func. The function should be associative so that it can be computed correctly in parallel. windowDuration and slideDuration are exactly as defined in window() .
- |
-|
groupByKeyAndWindow(windowDuration, slideDuration, [numTasks]) - | - When called on a DStream of (K, V) pairs, returns a new DStream of (K, Seq[V]) pairs by grouping together values of each key over batches in a sliding window. -Note: By default, this uses Spark's default number of parallel tasks (2 for local machine, 8 for a cluster) to do the grouping. You can pass an optional numTasks argument to set a different number of tasks. |
-|
reduceByKeyAndWindow(func, windowDuration, slideDuration, [numTasks]) | - When called on a DStream of (K, V) pairs, returns a new DStream of (K, V) pairs where the values for each key are aggregated using the given reduce function func over batches in a sliding window. Like in groupByKeyAndWindow , the number of reduce tasks is configurable through an optional second argument.
- windowDuration and slideDuration are exactly as defined in window() .
- |
-|
reduceByKeyAndWindow(func, invFunc, windowDuration, slideDuration, [numTasks]) | - A more efficient version of the above reduceByKeyAndWindow() where the reduce value of each window is calculated
- incrementally using the reduce values of the previous window. This is done by reducing the new data that enter the sliding window, and "inverse reducing" the old data that leave the window. An example would be that of "adding" and "subtracting" counts of keys as the window slides. However, it is applicable to only "invertible reduce functions", that is, those reduce functions which have a corresponding "inverse reduce" function (taken as parameter invFunc. Like in groupByKeyAndWindow , the number of reduce tasks is configurable through an optional second argument.
- windowDuration and slideDuration are exactly as defined in window() .
- |
-|
countByValueAndWindow(windowDuration, slideDuration, [numTasks]) | - When called on a DStream of (K, V) pairs, returns a new DStream of (K, Long) pairs where the value of each key is its frequency within a sliding window. Like in groupByKeyAndWindow , the number of reduce tasks is configurable through an optional second argument.
- windowDuration and slideDuration are exactly as defined in window() .
- |
+      Return a new "state" DStream where the state for each key is updated by applying the
+      given function on the previous state of the key and the new values for the key. This can be
+      used to maintain arbitrary state data for each key.
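For example, a running count per key can be maintained with `updateStateByKey`; a minimal sketch (the `pairs` DStream follows the earlier example, and checkpointing is assumed to be enabled, which this operation requires):

{% highlight scala %}
// Combine the values seen in this batch with the count carried over from previous batches.
val updateFunc = (newValues: Seq[Int], runningCount: Option[Int]) =>
  Some(runningCount.getOrElse(0) + newValues.sum)

val runningCounts = pairs.updateStateByKey[Int](updateFunc)
{% endhighlight %}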
Operator | Meaning |
---|---|
foreachRDD(func) | -The fundamental output operator. Applies a function, func, to each RDD generated from the stream. This function should have side effects, such as printing output, saving the RDD to external files, or writing it over the network to an external system. | -
print() | -Prints first ten elements of every batch of data in a DStream on the driver. | -
saveAsObjectFiles(prefix, [suffix]) | - Save this DStream's contents as a SequenceFile of serialized objects. The file name at each batch interval is generated based on prefix and suffix: "prefix-TIME_IN_MS[.suffix]".
- |
-
saveAsTextFiles(prefix, [suffix]) | -Save this DStream's contents as a text files. The file name at each batch interval is generated based on prefix and suffix: "prefix-TIME_IN_MS[.suffix]". | -
saveAsHadoopFiles(prefix, [suffix]) | -Save this DStream's contents as a Hadoop file. The file name at each batch interval is generated based on prefix and suffix: "prefix-TIME_IN_MS[.suffix]". | -
-{% highlight bash %}
-# TERMINAL 1
-# RUNNING NETCAT
+In fact, you can also use [machine learning](mllib-guide.html) and
+[graph computation](graphx-programming-guide.html) algorithms in the `transform` method.
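As a concrete illustration of `transform` (a sketch; the `wordCounts` and `spamInfoRDD` names are assumptions, not from the original guide — `spamInfoRDD` stands for a precomputed `RDD[(String, Boolean)]` of spam flags):

{% highlight scala %}
// transform applies an arbitrary RDD-to-RDD function to every batch of the DStream,
// so ordinary RDD operations (joins, MLlib models, GraphX algorithms) can be mixed in.
val cleanedCounts = wordCounts.transform { rdd =>
  rdd.join(spamInfoRDD).filter { case (word, (count, isSpam)) => !isSpam }
}
{% endhighlight %}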
-$ nc -lk 9999
-hello world
+## Window Operations
+Finally, Spark Streaming also provides *windowed computations*, which allow you to apply
+transformations over a sliding window of data. The following figure illustrates this sliding
+window.
+
+
+As shown in the figure, every time the window *slides* over a source DStream,
+the source RDDs that fall within the window are combined and operated upon to produce the
+RDDs of the windowed DStream. In this specific case, the operation is applied over the last 3 time
+units of data, and slides by 2 time units. This shows that any window-based operation needs to
+specify two parameters.
+ * window length - The duration of the window (3 in the figure)
+ * slide interval - The interval at which the window-based operation is performed (2 in
+   the figure).
-...
+These two parameters must be multiples of the batch interval of the source DStream (1 in the
+figure).
+
+Let's illustrate the window operations with an example. Say, you want to extend the
+[earlier example](#a-quick-example) by generating word counts over the last 30 seconds of data,
+every 10 seconds. To do this, we have to apply the `reduceByKey` operation on the `pairs` DStream of
+`(word, 1)` pairs over the last 30 seconds of data. This is done using the
+operation `reduceByKeyAndWindow`.
+
+
+
+
+{% highlight scala %}
+// Reduce last 30 seconds of data, every 10 seconds
+val windowedWordCounts = pairs.reduceByKeyAndWindow(_ + _, Seconds(30), Seconds(10))
{% endhighlight %}
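The more efficient incremental variant described in the window-operations table below can be sketched in the same way (the `ssc` name and checkpoint directory are assumptions; this form requires checkpointing to be enabled):

{% highlight scala %}
// Incremental form: add counts entering the window and "subtract" counts leaving it.
ssc.checkpoint("checkpoint")
val incrementalWordCounts = pairs.reduceByKeyAndWindow(
  (a: Int, b: Int) => a + b,   // reduce new data entering the window
  (a: Int, b: Int) => a - b,   // "inverse reduce" old data leaving the window
  Seconds(30), Seconds(10))
{% endhighlight %}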
- |
-
-{% highlight bash %}
-# TERMINAL 2: RUNNING NetworkWordCount
-...
--------------------------------------------
-Time: 1357008430000 ms
--------------------------------------------
-(hello,1)
-(world,1)
-...
+
+
+
+{% highlight java %}
+// Reduce function adding two integers, defined separately for clarity
+Function2<Integer, Integer, Integer> reduceFunc = new Function2<Integer, Integer, Integer>() {
+  @Override public Integer call(Integer i1, Integer i2) throws Exception {
+    return i1 + i2;
+  }
+};
+
+// Reduce last 30 seconds of data, every 10 seconds
+JavaPairDStream<String, Integer> windowedWordCounts =
+  pairs.reduceByKeyAndWindow(reduceFunc, new Duration(30000), new Duration(10000));
+{% endhighlight %}
-
Transformation | Meaning |
---|---|
window(windowLength, slideInterval) | +Return a new DStream which is computed based on windowed batches of the source DStream. + | +
countByWindow(windowLength, slideInterval) | +Return a sliding window count of elements in the stream. + | +
reduceByWindow(func, windowLength, slideInterval) | +Return a new single-element stream, created by aggregating elements in the stream over a + sliding interval using func. The function should be associative so that it can be computed + correctly in parallel. + | +
reduceByKeyAndWindow(func, windowLength, slideInterval, + [numTasks]) | + When called on a DStream of (K, V) pairs, returns a new DStream of (K, V)
+ pairs where the values for each key are aggregated using the given reduce function func
+ over batches in a sliding window. Note: By default, this uses Spark's default number of
+ parallel tasks (2 for local machine, 8 for a cluster) to do the grouping. You can pass an optional
+ numTasks argument to set a different number of tasks.
+ |
+
reduceByKeyAndWindow(func, invFunc, windowLength, + slideInterval, [numTasks]) | + A more efficient version of the above reduceByKeyAndWindow() where the reduce
+ value of each window is calculated incrementally using the reduce values of the previous window.
+ This is done by reducing the new data that enter the sliding window, and "inverse reducing" the
+ old data that leave the window. An example would be that of "adding" and "subtracting" counts
+ of keys as the window slides. However, it is applicable to only "invertible reduce functions",
+ that is, those reduce functions which have a corresponding "inverse reduce" function (taken as
+ parameter invFunc. Like in reduceByKeyAndWindow , the number of reduce tasks
+ is configurable through an optional argument.
+ |
+
countByValueAndWindow(windowLength, + slideInterval, [numTasks]) | + When called on a DStream of (K, V) pairs, returns a new DStream of (K, Long) pairs where the
+ value of each key is its frequency within a sliding window. Like in
+ reduceByKeyAndWindow , the number of reduce tasks is configurable through an
+ optional argument.
+ |
+
Output Operation | Meaning |
---|---|
print() | +Prints first ten elements of every batch of data in a DStream on the driver. | +
foreachRDD(func) | +The fundamental output operator. Applies a function, func, to each RDD generated from + the stream. This function should have side effects, such as printing output, saving the RDD to + external files, or writing it over the network to an external system. | +
saveAsObjectFiles(prefix, [suffix]) | + Save this DStream's contents as a SequenceFile of serialized objects. The file
+ name at each batch interval is generated based on prefix and
+ suffix: "prefix-TIME_IN_MS[.suffix]".
+ |
+
saveAsTextFiles(prefix, [suffix]) | +Save this DStream's contents as a text files. The file name at each batch interval is + generated based on prefix and suffix: "prefix-TIME_IN_MS[.suffix]". | +
saveAsHadoopFiles(prefix, [suffix]) | +Save this DStream's contents as a Hadoop file. The file name at each batch interval is + generated based on prefix and suffix: "prefix-TIME_IN_MS[.suffix]". | +