diff --git a/.gitignore b/.gitignore
index 39635d7eefbe7..3d178992123da 100644
--- a/.gitignore
+++ b/.gitignore
@@ -44,4 +44,4 @@ derby.log
 dist/
 spark-*-bin.tar.gz
 unit-tests.log
-lib/
+/lib/
diff --git a/CHANGES.txt b/CHANGES.txt
new file mode 100644
index 0000000000000..8c78d55ccd862
--- /dev/null
+++ b/CHANGES.txt
@@ -0,0 +1,3047 @@
+Spark Change Log
+
+Release 0.9.0-incubating
+
+  d0a105d Thu Jan 23 20:53:31 2014 -0800
+  Merge pull request #505 from JoshRosen/SPARK-1026
+  [Deprecate mapPartitionsWithSplit in PySpark (SPARK-1026)]
+
+  e66d4c2 Thu Jan 23 19:47:16 2014 -0800
+  Merge pull request #503 from pwendell/master
+  [Fix bug on read-side of external sort when using Snappy.]
+
+  e8d3f2b Thu Jan 23 19:20:22 2014 -0800
+  Merge pull request #502 from pwendell/clone-1
+  [Remove Hadoop object cloning and warn users making Hadoop RDD's.]
+
+  7a62353 Thu Jan 23 19:09:25 2014 -0800
+  Merge pull request #501 from JoshRosen/cartesian-rdd-fixes
+  [Fix two bugs in PySpark cartesian(): SPARK-978 and SPARK-1034]
+
+  51960b8 Wed Jan 22 19:37:50 2014 -0800
+  Merge pull request #496 from pwendell/master
+  [Fix bug in worker clean-up in UI]
+
+  828f7b4 Wed Jan 22 15:45:18 2014 -0800
+  Merge pull request #495 from srowen/GraphXCommonsMathDependency
+  [Fix graphx Commons Math dependency]
+
+  dc5857a Wed Jan 22 14:33:25 2014 -0800
+  Merge pull request #492 from skicavs/master
+  [fixed job name and usage information for the JavaSparkPi example]
+
+  dd533c9 Wed Jan 22 14:15:58 2014 -0800
+  Merge pull request #478 from sryza/sandy-spark-1033
+  [SPARK-1033. Ask for cores in Yarn container requests]
+
+  b6fd3cd Tue Jan 21 00:12:01 2014 -0800
+  Merge pull request #480 from pwendell/0.9-fixes
+  [Handful of 0.9 fixes]
+
+  e5f8917 Mon Jan 20 23:35:07 2014 -0800
+  Merge pull request #484 from tdas/run-example-fix
+  [Made run-example respect SPARK_JAVA_OPTS and SPARK_MEM.]
+
+  410ba06 Mon Jan 20 22:26:14 2014 -0800
+  Merge pull request #482 from tdas/streaming-example-fix
+  [Added StreamingContext.awaitTermination to streaming examples]
+
+  f137947 Mon Jan 20 22:24:07 2014 -0800
+  Merge pull request #483 from pwendell/gitignore
+  [Restricting /lib to top level directory in .gitignore]
+
+  94ae25d Sun Jan 19 11:33:51 2014 -0800
+  Merge pull request #470 from tgravescs/fix_spark_examples_yarn
+  [Only log error on missing jar to allow spark examples to jar.]
+
+  0f077b5 Sun Jan 19 10:30:29 2014 -0800
+  Merge pull request #458 from tdas/docs-update
+  [Updated java API docs for streaming, along with very minor changes in the code examples.]
+
+  03019d1 Sat Jan 18 16:29:43 2014 -0800
+  Merge pull request #459 from srowen/UpdaterL2Regularization
+  [Correct L2 regularized weight update with canonical form]
+
+  76147a2 Sat Jan 18 16:24:16 2014 -0800
+  Merge pull request #437 from mridulm/master
+  [Minor api usability changes]
+
+  4ac8cab Sat Jan 18 16:22:46 2014 -0800
+  Merge pull request #426 from mateiz/py-ml-tests
+  [Re-enable Python MLlib tests (require Python 2.7 and NumPy 1.7+)]
+
+  34e911c Sat Jan 18 16:17:34 2014 -0800
+  Merge pull request #462 from mateiz/conf-file-fix
+  [Remove Typesafe Config usage and conf files to fix nested property names]
+
+  ff7201c Sat Jan 18 12:50:02 2014 -0800
+  Merge pull request #461 from pwendell/master
+  [Use renamed shuffle spill config in CoGroupedRDD.scala]
+
+  7b0d5a5 Thu Jan 16 23:18:48 2014 -0800
+  Merge pull request #451 from Qiuzhuang/master
+  [Fixed Window spark shell launch script error.]
+ + 4ccedb3 Wed Jan 15 14:26:48 2014 -0800 + Merge pull request #444 from mateiz/py-version + [Clarify that Python 2.7 is only needed for MLlib] + + e3fa36f Wed Jan 15 13:56:04 2014 -0800 + Merge pull request #442 from pwendell/standalone + [Workers should use working directory as spark home if it's not specified] + + 29c76d9 Wed Jan 15 13:55:48 2014 -0800 + Merge pull request #443 from tdas/filestream-fix + [Made some classes private[stremaing] and deprecated a method in JavaStreamingContext.] + + aca40aa Wed Jan 15 11:15:47 2014 -0800 + Merge pull request #441 from pwendell/graphx-build + [GraphX shouldn't list Spark as provided.] + + e12c374 Wed Jan 15 10:01:43 2014 -0800 + Merge pull request #433 from markhamstra/debFix + [Updated Debian packaging] + + 2f015c2 Tue Jan 14 23:17:28 2014 -0800 + Merge pull request #436 from ankurdave/VertexId-case + [Rename VertexID -> VertexId in GraphX] + + 2859cab Tue Jan 14 23:08:19 2014 -0800 + Merge pull request #435 from tdas/filestream-fix + [Fixed the flaky tests by making SparkConf not serializable] + + fbfbb33 Tue Jan 14 23:06:29 2014 -0800 + Merge pull request #434 from rxin/graphxmaven + [Fixed SVDPlusPlusSuite in Maven build.] + + 2c6c07f Tue Jan 14 21:53:05 2014 -0800 + Merge pull request #424 from jegonzal/GraphXProgrammingGuide + [Additional edits for clarity in the graphx programming guide.] + + 6fa4e02 Tue Jan 14 21:51:25 2014 -0800 + Merge pull request #431 from ankurdave/graphx-caching-doc + [Describe caching and uncaching in GraphX programming guide] + + 2f930d5 Tue Jan 14 15:00:11 2014 -0800 + Merge pull request #428 from pwendell/writeable-objects + [Don't clone records for text files] + + 329c9df Tue Jan 14 14:53:36 2014 -0800 + Merge pull request #429 from ankurdave/graphx-examples-pom.xml + [Add GraphX dependency to examples/pom.xml] + + a14933d Tue Jan 14 14:52:42 2014 -0800 + Merge pull request #427 from pwendell/deprecate-aggregator + [Deprecate rather than remove old combineValuesByKey function] + + 119b6c5 Tue Jan 14 13:29:08 2014 -0800 + Merge pull request #425 from rxin/scaladoc + [API doc update & make Broadcast public] + + bf3b150 Tue Jan 14 09:45:22 2014 -0800 + Merge pull request #423 from jegonzal/GraphXProgrammingGuide + [Improving the graphx-programming-guide] + + 1b4adc2 Tue Jan 14 01:19:24 2014 -0800 + Merge pull request #420 from pwendell/header-files + [Add missing header files] + + b60840e Tue Jan 14 00:48:34 2014 -0800 + Merge pull request #418 from pwendell/0.9-versions + [Version changes for release 0.9.0.] + + 980250b Tue Jan 14 00:05:37 2014 -0800 + Merge pull request #416 from tdas/filestream-fix + [Removed unnecessary DStream operations and updated docs] + + 055be5c Mon Jan 13 23:26:44 2014 -0800 + Merge pull request #415 from pwendell/shuffle-compress + [Enable compression by default for spills] + + fdaabdc Mon Jan 13 23:08:26 2014 -0800 + Merge pull request #380 from mateiz/py-bayes + [Add Naive Bayes to Python MLlib, and some API fixes] + + 4a805af Mon Jan 13 22:58:38 2014 -0800 + Merge pull request #367 from ankurdave/graphx + [GraphX: Unifying Graphs and Tables] + + 945fe7a Mon Jan 13 22:56:12 2014 -0800 + Merge pull request #408 from pwendell/external-serializers + [Improvements to external sorting] + + 68641bc Mon Jan 13 22:54:13 2014 -0800 + Merge pull request #413 from rxin/scaladoc + [Adjusted visibility of various components and documentation for 0.9.0 release.] 
+ + 0ca0d4d Mon Jan 13 22:32:21 2014 -0800 + Merge pull request #401 from andrewor14/master + [External sorting - Add number of bytes spilled to Web UI] + + 08b9fec Mon Jan 13 22:29:03 2014 -0800 + Merge pull request #409 from tdas/unpersist + [Automatically unpersisting RDDs that have been cleaned up from DStreams] + + b07bc02 Mon Jan 13 20:45:22 2014 -0800 + Merge pull request #412 from harveyfeng/master + [Add default value for HadoopRDD's `cloneRecords` constructor arg] + + a2fee38 Mon Jan 13 19:45:26 2014 -0800 + Merge pull request #411 from tdas/filestream-fix + [Improved logic of finding new files in FileInputDStream] + + 01c0d72 Mon Jan 13 16:24:30 2014 -0800 + Merge pull request #410 from rxin/scaladoc1 + [Updated JavaStreamingContext to make scaladoc compile.] + + 8038da2 Mon Jan 13 14:59:30 2014 -0800 + Merge pull request #2 from jegonzal/GraphXCCIssue + [Improving documentation and identifying potential bug in CC calculation.] + + b93f9d4 Mon Jan 13 12:18:05 2014 -0800 + Merge pull request #400 from tdas/dstream-move + [Moved DStream and PairDSream to org.apache.spark.streaming.dstream] + + e6ed13f Sun Jan 12 22:35:14 2014 -0800 + Merge pull request #397 from pwendell/host-port + [Remove now un-needed hostPort option] + + 0b96d85 Sun Jan 12 21:31:43 2014 -0800 + Merge pull request #399 from pwendell/consolidate-off + [Disable shuffle file consolidation by default] + + 0ab505a Sun Jan 12 21:31:04 2014 -0800 + Merge pull request #395 from hsaputra/remove_simpleredundantreturn_scala + [Remove simple redundant return statements for Scala methods/functions] + + 405bfe8 Sun Jan 12 20:04:21 2014 -0800 + Merge pull request #394 from tdas/error-handling + [Better error handling in Spark Streaming and more API cleanup] + + 28a6b0c Sun Jan 12 19:49:36 2014 -0800 + Merge pull request #398 from pwendell/streaming-api + [Rename DStream.foreach to DStream.foreachRDD] + + 074f502 Sun Jan 12 17:01:13 2014 -0800 + Merge pull request #396 from pwendell/executor-env + [Setting load defaults to true in executor] + + 82e2b92 Sun Jan 12 16:55:11 2014 -0800 + Merge pull request #392 from rxin/listenerbus + [Stop SparkListenerBus daemon thread when DAGScheduler is stopped.] + + 288a878 Sat Jan 11 21:53:19 2014 -0800 + Merge pull request #389 from rxin/clone-writables + [Minor update for clone writables and more documentation.] + + dbc11df Sat Jan 11 18:07:13 2014 -0800 + Merge pull request #388 from pwendell/master + [Fix UI bug introduced in #244.] + + 409866b Sat Jan 11 17:12:06 2014 -0800 + Merge pull request #393 from pwendell/revert-381 + [Revert PR 381] + + 6510f04 Sat Jan 11 12:48:26 2014 -0800 + Merge pull request #387 from jerryshao/conf-fix + [Fix configure didn't work small problem in ALS] + + ee6e7f9 Sat Jan 11 12:07:55 2014 -0800 + Merge pull request #359 from ScrapCodes/clone-writables + [We clone hadoop key and values by default and reuse objects if asked to.] 
+ + 4216178 Sat Jan 11 09:46:48 2014 -0800 + Merge pull request #373 from jerryshao/kafka-upgrade + [Upgrade Kafka dependecy to 0.8.0 release version] + + 92ad18b Fri Jan 10 23:25:15 2014 -0800 + Merge pull request #376 from prabeesh/master + [Change clientId to random clientId] + + 0b5ce7a Fri Jan 10 23:23:21 2014 -0800 + Merge pull request #386 from pwendell/typo-fix + [Small typo fix] + + 1d7bef0 Fri Jan 10 18:53:03 2014 -0800 + Merge pull request #381 from mateiz/default-ttl + [Fix default TTL for metadata cleaner] + + 44d6a8e Fri Jan 10 17:51:50 2014 -0800 + Merge pull request #382 from RongGu/master + [Fix a type error in comment lines] + + 88faa30 Fri Jan 10 17:14:22 2014 -0800 + Merge pull request #385 from shivaram/add-i2-instances + [Add i2 instance types to Spark EC2.] + + f265531 Fri Jan 10 16:25:44 2014 -0800 + Merge pull request #383 from tdas/driver-test + [API for automatic driver recovery for streaming programs and other bug fixes] + + d37408f Fri Jan 10 16:25:01 2014 -0800 + Merge pull request #377 from andrewor14/master + [External Sorting for Aggregator and CoGroupedRDDs (Revisited)] + + 0eaf01c Fri Jan 10 15:32:19 2014 -0800 + Merge pull request #369 from pillis/master + [SPARK-961 Add a Vector.random() method] + + 7cef843 Fri Jan 10 15:34:15 2014 -0600 + Merge pull request #371 from tgravescs/yarn_client_addjar_misc_fixes + [Yarn client addjar and misc fixes] + + 7b58f11 Fri Jan 10 12:47:46 2014 -0800 + Merge pull request #384 from pwendell/debug-logs + [Make DEBUG-level logs consummable.] + + 23d2995 Fri Jan 10 10:20:02 2014 -0800 + Merge pull request #1 from jegonzal/graphx + [ProgrammingGuide] + + 0ebc973 Thu Jan 9 23:58:49 2014 -0800 + Merge pull request #375 from mateiz/option-fix + [Fix bug added when we changed AppDescription.maxCores to an Option] + + dd03cea Thu Jan 9 23:38:03 2014 -0800 + Merge pull request #378 from pwendell/consolidate_on + [Enable shuffle consolidation by default.] + + 997c830 Thu Jan 9 22:22:20 2014 -0800 + Merge pull request #363 from pwendell/streaming-logs + [Set default logging to WARN for Spark streaming examples.] + + 300eaa9 Thu Jan 9 20:29:51 2014 -0800 + Merge pull request #353 from pwendell/ipython-simplify + [Simplify and fix pyspark script.] + + 4b074fa Thu Jan 9 19:03:55 2014 -0800 + Merge pull request #374 from mateiz/completeness + [Add some missing Java API methods] + + a9d5333 Thu Jan 9 18:46:46 2014 -0800 + Merge pull request #294 from RongGu/master + [Bug fixes for updating the RDD block's memory and disk usage information] + + d86a85e Thu Jan 9 18:37:52 2014 -0800 + Merge pull request #293 from pwendell/standalone-driver + [SPARK-998: Support Launching Driver Inside of Standalone Mode] + + 26cdb5f Thu Jan 9 17:16:34 2014 -0800 + Merge pull request #372 from pwendell/log4j-fix-1 + [Send logs to stderr by default (instead of stdout).] + + 12f414e Thu Jan 9 15:31:30 2014 -0800 + Merge pull request #362 from mateiz/conf-getters + [Use typed getters for configuration settings] + + 365cac9 Thu Jan 9 00:56:16 2014 -0800 + Merge pull request #361 from rxin/clean + [Minor style cleanup. Mostly on indenting & line width changes.] + + 73c724e Thu Jan 9 00:32:19 2014 -0800 + Merge pull request #368 from pwendell/sbt-fix + [Don't delegate to users `sbt`.] + + dceedb4 Wed Jan 8 23:19:28 2014 -0800 + Merge pull request #364 from pwendell/fix + [Fixing config option "retained_stages" => "retainedStages".] 
+ + 04d83fc Wed Jan 8 11:55:37 2014 -0800 + Merge pull request #360 from witgo/master + [fix make-distribution.sh show version: command not found] + + 56ebfea Wed Jan 8 11:50:06 2014 -0800 + Merge pull request #357 from hsaputra/set_boolean_paramname + [Set boolean param name for call to SparkHadoopMapReduceUtil.newTaskAttemptID] + + bdeaeaf Wed Jan 8 11:48:39 2014 -0800 + Merge pull request #358 from pwendell/add-cdh + [Add CDH Repository to Maven Build] + + 5cae05f Wed Jan 8 11:47:28 2014 -0800 + Merge pull request #356 from hsaputra/remove_deprecated_cleanup_method + [Remove calls to deprecated mapred's OutputCommitter.cleanupJob] + + 6eef78d Wed Jan 8 08:49:20 2014 -0600 + Merge pull request #345 from colorant/yarn + [support distributing extra files to worker for yarn client mode] + + bb6a39a Tue Jan 7 22:32:18 2014 -0800 + Merge pull request #322 from falaki/MLLibDocumentationImprovement + [SPARK-1009 Updated MLlib docs to show how to use it in Python] + + cb1b927 Tue Jan 7 22:26:28 2014 -0800 + Merge pull request #355 from ScrapCodes/patch-1 + [Update README.md] + + c0f0155 Tue Jan 7 22:21:52 2014 -0800 + Merge pull request #313 from tdas/project-refactor + [Refactored the streaming project to separate external libraries like Twitter, Kafka, Flume, etc.] + + f5f12dc Tue Jan 7 21:56:35 2014 -0800 + Merge pull request #336 from liancheng/akka-remote-lookup + [Get rid of `Either[ActorRef, ActorSelection]'] + + 11891e6 Wed Jan 8 00:32:18 2014 -0500 + Merge pull request #327 from lucarosellini/master + [Added ā€˜-iā€™ command line option to Spark REPL] + + 7d0aac9 Wed Jan 8 00:30:45 2014 -0500 + Merge pull request #354 from hsaputra/addasfheadertosbt + [Add ASF header to the new sbt script.] + + d75dc42 Wed Jan 8 00:30:03 2014 -0500 + Merge pull request #350 from mateiz/standalone-limit + [Add way to limit default # of cores used by apps in standalone mode] + + 61674bc Tue Jan 7 18:32:13 2014 -0800 + Merge pull request #352 from markhamstra/oldArch + [Don't leave os.arch unset after BlockManagerSuite] + + b2e690f Tue Jan 7 16:57:08 2014 -0800 + Merge pull request #328 from falaki/MatrixFactorizationModel-fix + [SPARK-1012: DAGScheduler Exception Fix] + + 6ccf8ce Tue Jan 7 15:49:14 2014 -0800 + Merge pull request #351 from pwendell/maven-fix + [Add log4j exclusion rule to maven.] 
+ + 7d5fa17 Tue Jan 7 11:31:34 2014 -0800 + Merge pull request #337 from yinxusen/mllib-16-bugfix + [Mllib 16 bugfix] + + 71fc113 Tue Jan 7 11:30:35 2014 -0800 + Merge pull request #349 from CodingCat/support-worker_dir + [add the comments about SPARK_WORKER_DIR] + + 15d9534 Tue Jan 7 08:10:02 2014 -0800 + Merge pull request #318 from srowen/master + [Suggested small changes to Java code for slightly more standard style, encapsulation and in some cases performance] + + 468af0f Tue Jan 7 08:09:01 2014 -0800 + Merge pull request #348 from prabeesh/master + [spark -> org.apache.spark] + + c3cf047 Tue Jan 7 00:54:25 2014 -0800 + Merge pull request #339 from ScrapCodes/conf-improvements + [Conf improvements] + + a862caf Tue Jan 7 00:18:20 2014 -0800 + Merge pull request #331 from holdenk/master + [Add a script to download sbt if not present on the system] + + b97ef21 Mon Jan 6 20:12:57 2014 -0800 + Merge pull request #346 from sproblvem/patch-1 + [Update stop-slaves.sh] + + 7210257 Mon Jan 6 18:25:44 2014 -0800 + Merge pull request #128 from adamnovak/master + [Fix failing "sbt/sbt publish-local" by adding a no-argument PrimitiveKeyOpenHashMap constructor ] + + e4d6057 Mon Jan 6 14:56:54 2014 -0800 + Merge pull request #343 from pwendell/build-fix + [Fix test breaking downstream builds] + + 93bf962 Mon Jan 6 11:42:41 2014 -0800 + Merge pull request #340 from ScrapCodes/sbt-fixes + [Made java options to be applied during tests so that they become self explanatory.] + + 60edeb3 Mon Jan 6 11:40:32 2014 -0800 + Merge pull request #338 from ScrapCodes/ning-upgrade + [SPARK-1005 Ning upgrade] + + c708e81 Mon Jan 6 11:35:48 2014 -0800 + Merge pull request #341 from ash211/patch-5 + [Clarify spark.cores.max in docs] + + 33fcb91 Mon Jan 6 11:19:23 2014 -0800 + Merge pull request #342 from tgravescs/fix_maven_protobuf + [Change protobuf version for yarn alpha back to 2.4.1] + + 357083c Mon Jan 6 10:29:04 2014 -0800 + Merge pull request #330 from tgravescs/fix_addjars_null_handling + [Fix handling of empty SPARK_EXAMPLES_JAR] + + a2e7e04 Sun Jan 5 22:37:36 2014 -0800 + Merge pull request #333 from pwendell/logging-silence + [Quiet ERROR-level Akka Logs] + + 5b0986a Sun Jan 5 19:25:09 2014 -0800 + Merge pull request #334 from pwendell/examples-fix + [Removing SPARK_EXAMPLES_JAR in the code] + + f4b924f Sun Jan 5 17:11:47 2014 -0800 + Merge pull request #335 from rxin/ser + [Fall back to zero-arg constructor for Serializer initialization if there is no constructor that accepts SparkConf.] 
+ + d43ad3e Sat Jan 4 16:29:30 2014 -0800 + Merge pull request #292 from soulmachine/naive-bayes + [standard Naive Bayes classifier] + + 86404da Sat Jan 4 14:55:54 2014 -0800 + Merge pull request #127 from jegonzal/MapByPartition + [Adding mapEdges and mapTriplets by Partition] + + e68cdb1 Sat Jan 4 13:46:02 2014 -0800 + Merge pull request #124 from jianpingjwang/master + [refactor and bug fix] + + 280ddf6 Sat Jan 4 12:54:41 2014 -0800 + Merge pull request #121 from ankurdave/more-simplify + [Simplify GraphImpl internals further] + + 10fe23b Fri Jan 3 23:50:14 2014 -0800 + Merge pull request #329 from pwendell/remove-binaries + [SPARK-1002: Remove Binaries from Spark Source] + + c4d6145 Fri Jan 3 16:30:53 2014 -0800 + Merge pull request #325 from witgo/master + [Modify spark on yarn to create SparkConf process] + + 4ae101f Fri Jan 3 11:24:35 2014 -0800 + Merge pull request #317 from ScrapCodes/spark-915-segregate-scripts + [Spark-915 segregate scripts] + + 87248bd Fri Jan 3 00:45:31 2014 -0800 + Merge pull request #1 from apache/master + [Merge latest Spark changes] + + 30b9db0 Thu Jan 2 23:15:55 2014 -0800 + Merge pull request #285 from colorant/yarn-refactor + [Yarn refactor] + + 498a5f0 Thu Jan 2 19:06:40 2014 -0800 + Merge pull request #323 from tgravescs/sparkconf_yarn_fix + [fix spark on yarn after the sparkConf changes] + + 0475ca8 Thu Jan 2 15:17:08 2014 -0800 + Merge pull request #320 from kayousterhout/erroneous_failed_msg + [Remove erroneous FAILED state for killed tasks.] + + 588a169 Thu Jan 2 13:20:54 2014 -0800 + Merge pull request #297 from tdas/window-improvement + [Improvements to DStream window ops and refactoring of Spark's CheckpointSuite] + + 5e67cdc Thu Jan 2 12:56:28 2014 -0800 + Merge pull request #319 from kayousterhout/remove_error_method + [Removed redundant TaskSetManager.error() function.] + + ca67909 Thu Jan 2 15:54:54 2014 -0500 + Merge pull request #311 from tmyklebu/master + [SPARK-991: Report information gleaned from a Python stacktrace in the UI] + + 3713f81 Wed Jan 1 21:29:12 2014 -0800 + Merge pull request #309 from mateiz/conf2 + [SPARK-544. 
Migrate configuration to a SparkConf class] + + c1d928a Wed Jan 1 17:03:48 2014 -0800 + Merge pull request #312 from pwendell/log4j-fix-2 + [SPARK-1008: Logging improvments] + + dc9cb83 Wed Jan 1 13:28:34 2014 -0800 + Merge pull request #126 from jegonzal/FixingPersist + [Fixing Persist Behavior] + + 9a0ff72 Tue Dec 31 21:50:24 2013 -0800 + Merge pull request #314 from witgo/master + [restore core/pom.xml file modification] + + 8b8e70e Tue Dec 31 17:48:24 2013 -0800 + Merge pull request #73 from falaki/ApproximateDistinctCount + [Approximate distinct count] + + 63b411d Tue Dec 31 14:31:28 2013 -0800 + Merge pull request #238 from ngbinh/upgradeNetty + [upgrade Netty from 4.0.0.Beta2 to 4.0.13.Final] + + 32d6ae9 Tue Dec 31 13:51:07 2013 -0800 + Merge pull request #120 from ankurdave/subgraph-reuses-view + [Reuse VTableReplicated in GraphImpl.subgraph] + + 55b7e2f Tue Dec 31 10:12:51 2013 -0800 + Merge pull request #289 from tdas/filestream-fix + [Bug fixes for file input stream and checkpointing] + + 2b71ab9 Mon Dec 30 11:01:30 2013 -0800 + Merge pull request from aarondav: Utilize DiskBlockManager pathway for temp file writing + [This gives us a couple advantages:] + + 50e3b8e Mon Dec 30 07:44:26 2013 -0800 + Merge pull request #308 from kayousterhout/stage_naming + [Changed naming of StageCompleted event to be consistent] + + 72a17b6 Sat Dec 28 21:25:40 2013 -1000 + Revert "Merge pull request #310 from jyunfan/master" + [This reverts commit 79b20e4dbe3dcd8559ec8316784d3334bb55868b, reversing] + + 79b20e4 Sat Dec 28 21:13:36 2013 -1000 + Merge pull request #310 from jyunfan/master + [Fix typo in the Accumulators section] + + 7375047 Sat Dec 28 13:25:06 2013 -0800 + Merge pull request #304 from kayousterhout/remove_unused + [Removed unused failed and causeOfFailure variables (in TaskSetManager)] + + ad3dfd1 Fri Dec 27 22:10:14 2013 -0500 + Merge pull request #307 from kayousterhout/other_failure + [Removed unused OtherFailure TaskEndReason.] + + b579b83 Fri Dec 27 22:09:04 2013 -0500 + Merge pull request #306 from kayousterhout/remove_pending + [Remove unused hasPendingTasks methods] + + 19672dc Fri Dec 27 13:37:10 2013 -0800 + Merge pull request #305 from kayousterhout/line_spacing + [Fixed >100char lines in DAGScheduler.scala] + + 7be1e57 Thu Dec 26 23:41:40 2013 -1000 + Merge pull request #298 from aarondav/minor + [Minor: Decrease margin of left side of Log page] + + 7d811ba Thu Dec 26 23:39:58 2013 -1000 + Merge pull request #302 from pwendell/SPARK-1007 + [SPARK-1007: spark-class2.cmd should change SCALA_VERSION to be 2.10] + + 5e69fc5 Thu Dec 26 19:10:39 2013 -0500 + Merge pull request #295 from markhamstra/JobProgressListenerNPE + [Avoid a lump of coal (NPE) in JobProgressListener's stocking.] + + da20270 Thu Dec 26 12:11:52 2013 -0800 + Merge pull request #1 from aarondav/driver + [Refactor DriverClient to be more Actor-based] + + e240bad Thu Dec 26 12:30:48 2013 -0500 + Merge pull request #296 from witgo/master + [Renamed ClusterScheduler to TaskSchedulerImpl for yarn and new-yarn package] + + c344ed0 Thu Dec 26 01:31:06 2013 -0500 + Merge pull request #283 from tmyklebu/master + [Python bindings for mllib] + + 56094bc Wed Dec 25 13:14:33 2013 -0500 + Merge pull request #290 from ash211/patch-3 + [Typo: avaiable -> available] + + 4842a07 Wed Dec 25 01:52:15 2013 -0800 + Merge pull request #287 from azuryyu/master + [Fixed job name in the java streaming example.] 
+ + 85a344b Tue Dec 24 16:35:06 2013 -0800 + Merge pull request #127 from kayousterhout/consolidate_schedulers + [Deduplicate Local and Cluster schedulers.] + + c2dd6bc Tue Dec 24 14:36:47 2013 -0800 + Merge pull request #279 from aarondav/shuffle-cleanup0 + [Clean up shuffle files once their metadata is gone] + + 3bf7c70 Tue Dec 24 16:37:13 2013 -0500 + Merge pull request #275 from ueshin/wip/changeclasspathorder + [Change the order of CLASSPATH.] + + d63856c Mon Dec 23 22:07:26 2013 -0800 + Merge pull request #286 from rxin/build + [Show full stack trace and time taken in unit tests.] + + 23a9ae6 Tue Dec 24 00:08:48 2013 -0500 + Merge pull request #277 from tdas/scheduler-update + [Refactored the streaming scheduler and added StreamingListener interface] + + 11107c9 Mon Dec 23 10:38:20 2013 -0800 + Merge pull request #244 from leftnoteasy/master + [Added SPARK-968 implementation for review] + + 44e4205 Sun Dec 22 11:44:18 2013 -0800 + Merge pull request #116 from jianpingjwang/master + [remove unused variables and fix a bug] + + 4797c22 Fri Dec 20 13:30:39 2013 -0800 + Merge pull request #118 from ankurdave/VertexPartitionSuite + [Test VertexPartition and fix bugs] + + 0bc57c5 Fri Dec 20 11:56:54 2013 -0800 + Merge pull request #280 from aarondav/minor + [Minor cleanup for standalone scheduler] + + ac70b8f Fri Dec 20 10:56:10 2013 -0800 + Merge pull request #117 from ankurdave/more-tests + [More tests] + + 45310d4 Thu Dec 19 22:08:20 2013 -0800 + Merge pull request #115 from ankurdave/test-reorg + [Reorganize unit tests; improve GraphSuite test coverage] + + 9228ec8 Thu Dec 19 21:37:15 2013 -0800 + Merge pull request #1 from aarondav/127 + [Merge master into 127] + + eca68d4 Thu Dec 19 18:12:22 2013 -0800 + Merge pull request #272 from tmyklebu/master + [Track and report task result serialisation time.] + + 7990c56 Thu Dec 19 13:35:09 2013 -0800 + Merge pull request #276 from shivaram/collectPartition + [Add collectPartition to JavaRDD interface.] + + 440e531 Thu Dec 19 10:38:56 2013 -0800 + Merge pull request #278 from MLnick/java-python-tostring + [Add toString to Java RDD, and __repr__ to Python RDD] + + d8d3f3e Thu Dec 19 00:06:43 2013 -0800 + Merge pull request #183 from aarondav/spark-959 + [[SPARK-959] Explicitly depend on org.eclipse.jetty.orbit jar] + + bfba532 Wed Dec 18 22:22:21 2013 -0800 + Merge pull request #247 from aarondav/minor + [Increase spark.akka.askTimeout default to 30 seconds] + + da301b5 Wed Dec 18 20:03:29 2013 -0800 + Merge pull request #112 from amatsukawa/scc + [Strongly connected component algorithm] + + c64a53a Wed Dec 18 16:56:26 2013 -0800 + Merge pull request #267 from JoshRosen/cygwin + [Fix Cygwin support in several scripts.] + + a645ef6 Wed Dec 18 16:07:52 2013 -0800 + Merge pull request #48 from amatsukawa/add_project_to_graph + [Add mask operation on graph and filter graph primitive] + + d7ebff0 Wed Dec 18 15:38:48 2013 -0800 + Merge pull request #1 from ankurdave/add_project_to_graph + [Merge current master and reimplement Graph.mask using innerJoin] + + 5ea1872 Wed Dec 18 15:27:24 2013 -0800 + Merge pull request #274 from azuryy/master + [Fixed the example link in the Scala programing guid.] 
+ + 3fd2e09 Wed Dec 18 12:52:36 2013 -0800 + Merge pull request #104 from jianpingjwang/master + [SVD++ demo] + + f4effb3 Tue Dec 17 22:26:21 2013 -0800 + Merge pull request #273 from rxin/top + [Fixed a performance problem in RDD.top and BoundedPriorityQueue] + + 1b5eacb Tue Dec 17 13:49:17 2013 -0800 + Merge pull request #102 from ankurdave/clustered-edge-index + [Add clustered index on edges by source vertex] + + 7a8169b Mon Dec 16 22:42:21 2013 -0800 + Merge pull request #268 from pwendell/shaded-protobuf + [Add support for 2.2. to master (via shaded jars)] + + 0476c84 Mon Dec 16 17:19:25 2013 -0800 + Merge pull request #100 from ankurdave/mrTriplets-active-set + [Support activeSet option in mapReduceTriplets] + + 964a3b6 Mon Dec 16 15:23:51 2013 -0800 + Merge pull request #270 from ewencp/really-force-ssh-pseudo-tty-master + [Force pseudo-tty allocation in spark-ec2 script.] + + 5192ef3 Mon Dec 16 15:08:08 2013 -0800 + Merge pull request #94 from ankurdave/load-edges-columnar + [Load edges in columnar format] + + 883e034 Mon Dec 16 14:16:02 2013 -0800 + Merge pull request #245 from gregakespret/task-maxfailures-fix + [Fix for spark.task.maxFailures not enforced correctly.] + + a51f340 Sun Dec 15 22:02:30 2013 -0800 + Merge pull request #265 from markhamstra/scala.binary.version + [DRY out the POMs with scala.binary.version] + + ded10ce Sun Dec 15 17:25:33 2013 -0800 + Merge pull request #103 from amplab/optimizations + [Optimizations cherry-picked from SIGMOD branches] + + d2ced6d Sun Dec 15 14:11:34 2013 -0800 + Merge pull request #256 from MLnick/master + [Fix 'IPYTHON=1 ./pyspark' throwing ValueError] + + c55e698 Sun Dec 15 12:49:02 2013 -0800 + Merge pull request #257 from tgravescs/sparkYarnFixName + [Fix the --name option for Spark on Yarn] + + ab85f88 Sun Dec 15 12:48:32 2013 -0800 + Merge pull request #264 from shivaram/spark-class-fix + [Use CoarseGrainedExecutorBackend in spark-class] + + 8a56c1f Sat Dec 14 16:29:24 2013 -0800 + Merge pull request #84 from amatsukawa/graphlab_enhancements + [GraphLab bug fix & set start vertex] + + 7db9165 Sat Dec 14 14:16:34 2013 -0800 + Merge pull request #251 from pwendell/master + [Fix list rendering in YARN markdown docs.] + + 2fd781d Sat Dec 14 12:59:37 2013 -0800 + Merge pull request #249 from ngbinh/partitionInJavaSortByKey + [Expose numPartitions parameter in JavaPairRDD.sortByKey()] + + 9bf192b Sat Dec 14 12:52:18 2013 -0800 + Merge pull request #91 from amplab/standalone-pagerank + [Standalone PageRank] + + 840af5e Sat Dec 14 12:51:51 2013 -0800 + Merge pull request #99 from ankurdave/only-dynamic-pregel + [Remove static Pregel; take maxIterations in dynamic Pregel] + + 97ac060 Sat Dec 14 00:22:45 2013 -0800 + Merge pull request #259 from pwendell/scala-2.10 + [Migration to Scala 2.10] + + 7ac944f Fri Dec 13 23:22:08 2013 -0800 + Merge pull request #262 from pwendell/mvn-fix + [Fix maven build issues in 2.10 branch] + + 6defb06 Fri Dec 13 21:18:57 2013 -0800 + Merge pull request #261 from ScrapCodes/scala-2.10 + [Added a comment about ActorRef and ActorSelection difference.] + + 76566b1 Fri Dec 13 10:11:02 2013 -0800 + Merge pull request #260 from ScrapCodes/scala-2.10 + [Review comments on the PR for scala 2.10 migration.] + + 0aeb182 Thu Dec 12 21:14:42 2013 -0800 + Merge pull request #255 from ScrapCodes/scala-2.10 + [Disabled yarn 2.2 in sbt and mvn build and added a message in the sbt build.] 
+ + 2e89398 Wed Dec 11 23:10:53 2013 -0800 + Merge pull request #254 from ScrapCodes/scala-2.10 + [Scala 2.10 migration] + + ce6ca4e Wed Dec 11 22:30:54 2013 -0800 + Merge pull request #97 from dcrankshaw/fix-rddtop + [Added BoundedPriorityQueue kryo registrator. Fixes top issue.] + + d2efe13 Tue Dec 10 13:01:26 2013 -0800 + Merge pull request #250 from pwendell/master + [README incorrectly suggests build sources spark-env.sh] + + 6169fe1 Mon Dec 9 16:51:36 2013 -0800 + Merge pull request #246 from pwendell/master + [Add missing license headers] + + d992ec6 Sun Dec 8 20:49:20 2013 -0800 + Merge pull request #195 from dhardy92/fix_DebScriptPackage + [[Deb] fix package of Spark classes adding org.apache prefix in scripts embeded in .deb] + + 1f4a4bc Sat Dec 7 22:34:34 2013 -0800 + Merge pull request #242 from pwendell/master + [Update broken links and add HDP 2.0 version string] + + 6494d62 Sat Dec 7 11:56:16 2013 -0800 + Merge pull request #240 from pwendell/master + [SPARK-917 Improve API links in nav bar] + + f466f79 Sat Dec 7 11:51:52 2013 -0800 + Merge pull request #239 from aarondav/nit + [Correct spellling error in configuration.md] + + 3abfbfb Sat Dec 7 11:24:19 2013 -0800 + Merge pull request #92 from ankurdave/rdd-names + [Set RDD names for easy debugging] + + 31e8a14 Fri Dec 6 21:49:55 2013 -0800 + Merge pull request #90 from amplab/pregel-replicate-changed + [Replicate only changed vertices] + + 10c3c0c Fri Dec 6 20:29:45 2013 -0800 + Merge pull request #237 from pwendell/formatting-fix + [Formatting fix] + + 1b38f5f Fri Dec 6 20:16:15 2013 -0800 + Merge pull request #236 from pwendell/shuffle-docs + [Adding disclaimer for shuffle file consolidation] + + e5d5728 Fri Dec 6 20:14:56 2013 -0800 + Merge pull request #235 from pwendell/master + [Minor doc fixes and updating README] + + 241336a Fri Dec 6 17:29:03 2013 -0800 + Merge pull request #234 from alig/master + [Updated documentation about the YARN v2.2 build process] + + e039234 Fri Dec 6 11:49:59 2013 -0800 + Merge pull request #190 from markhamstra/Stages4Jobs + [stageId <--> jobId mapping in DAGScheduler] + + bfa6860 Fri Dec 6 11:04:03 2013 -0800 + Merge pull request #233 from hsaputra/changecontexttobackend + [Change the name of input argument in ClusterScheduler#initialize from context to backend.] + + 3fb302c Fri Dec 6 11:03:32 2013 -0800 + Merge pull request #205 from kayousterhout/logging + [Added logging of scheduler delays to UI] + + 87676a6 Fri Dec 6 11:01:42 2013 -0800 + Merge pull request #220 from rxin/zippart + [Memoize preferred locations in ZippedPartitionsBaseRDD] + + 0780498 Thu Dec 5 23:29:42 2013 -0800 + Merge pull request #232 from markhamstra/FiniteWait + [jobWaiter.synchronized before jobWaiter.wait] + + 1c8500e Thu Dec 5 16:25:44 2013 -0800 + Merge pull request #88 from amplab/varenc + [Fixed a bug that variable encoding doesn't work for ints that use all 64 bits.] + + e0bcaa0 Thu Dec 5 12:37:02 2013 -0800 + Merge pull request #86 from ankurdave/vid-varenc + [Finish work on #85] + + 5d46025 Thu Dec 5 12:31:24 2013 -0800 + Merge pull request #228 from pwendell/master + [Document missing configs and set shuffle consolidation to false.] 
+ + 3e96b9a Thu Dec 5 12:07:36 2013 -0800 + Merge pull request #85 from ankurdave/vid-varenc + [Always write Vids using variable encoding] + + 72b6961 Wed Dec 4 23:33:04 2013 -0800 + Merge pull request #199 from harveyfeng/yarn-2.2 + [Hadoop 2.2 migration] + + e0347ba Wed Dec 4 17:38:06 2013 -0800 + Merge pull request #83 from ankurdave/fix-tests + [Fix compile errors in GraphSuite and SerializerSuite] + + 182f9ba Wed Dec 4 15:52:07 2013 -0800 + Merge pull request #227 from pwendell/master + [Fix small bug in web UI and minor clean-up.] + + cbd3b75 Wed Dec 4 15:35:26 2013 -0800 + Merge pull request #81 from amplab/clean1 + [Codebase refactoring] + + b9e7609 Wed Dec 4 14:42:09 2013 -0800 + Merge pull request #225 from ash211/patch-3 + [Add missing space after "Serialized" in StorageLevel] + + 055462c Wed Dec 4 14:02:11 2013 -0800 + Merge pull request #226 from ash211/patch-4 + [Typo: applicaton] + + d6e5473 Wed Dec 4 10:28:50 2013 -0800 + Merge pull request #223 from rxin/transient + [Mark partitioner, name, and generator field in RDD as @transient.] + + 8a3475a Tue Dec 3 14:21:40 2013 -0800 + Merge pull request #218 from JoshRosen/spark-970-pyspark-unicode-error + [Fix UnicodeEncodeError in PySpark saveAsTextFile() (SPARK-970)] + + 46b87b8 Tue Dec 3 00:41:11 2013 -0800 + Merge pull request #2 from colorant/yarn-client-2.2 + [Fix pom.xml for maven build] + + 58d9bbc Mon Dec 2 21:58:53 2013 -0800 + Merge pull request #217 from aarondav/mesos-urls + [Re-enable zk:// urls for Mesos SparkContexts] + + 740922f Sun Dec 1 12:46:58 2013 -0800 + Merge pull request #219 from sundeepn/schedulerexception + [Scheduler quits when newStage fails] + + 60e23a5 Sat Nov 30 23:38:49 2013 -0800 + Merge pull request #216 from liancheng/fix-spark-966 + [Bugfix: SPARK-965 & SPARK-966] + + 34ee814 Sat Nov 30 15:10:30 2013 -0800 + Merged Ankur's pull request #80 and fixed subgraph. + [] + + 743a31a Wed Nov 27 18:24:39 2013 -0800 + Merge pull request #210 from haitaoyao/http-timeout + [add http timeout for httpbroadcast] + + 993e293 Wed Nov 27 00:57:54 2013 -0800 + Merge pull request #1 from colorant/yarn-client-2.2 + [Port yarn-client mode for new-yarn] + + fb6875d Tue Nov 26 20:55:40 2013 -0800 + Merge pull request #146 from JoshRosen/pyspark-custom-serializers + [Custom Serializers for PySpark] + + 330ada1 Tue Nov 26 19:08:33 2013 -0800 + Merge pull request #207 from henrydavidge/master + [Log a warning if a task's serialized size is very big] + + 615213f Tue Nov 26 19:07:20 2013 -0800 + Merge pull request #212 from markhamstra/SPARK-963 + [[SPARK-963] Fixed races in JobLoggerSuite] + + cb976df Tue Nov 26 10:23:19 2013 -0800 + Merge pull request #209 from pwendell/better-docs + [Improve docs for shuffle instrumentation] + + 18d6df0 Tue Nov 26 00:00:07 2013 -0800 + Merge pull request #86 from holdenk/master + [Add histogram functionality to DoubleRDDFunctions] + + 0e2109d Mon Nov 25 20:48:37 2013 -0800 + Merge pull request #204 from rxin/hash + [OpenHashSet fixes] + + c46067f Mon Nov 25 19:09:31 2013 -0800 + Merge pull request #206 from ash211/patch-2 + [Update tuning.md] + + 14bb465 Mon Nov 25 18:50:18 2013 -0800 + Merge pull request #201 from rxin/mappartitions + [Use the proper partition index in mapPartitionsWIthIndex] + + eb4296c Mon Nov 25 15:25:29 2013 -0800 + Merge pull request #101 from colorant/yarn-client-scheduler + [For SPARK-527, Support spark-shell when running on YARN] + + 466fd06 Mon Nov 25 18:27:26 2013 +0800 + Incorporated ideas from pull request #200. 
- Use Murmur Hash 3 finalization step to scramble the bits of HashCode instead of the simpler version in java.util.HashMap; the latter one had trouble with ranges of consecutive integers. Murmur Hash 3 is used by fastutil. + [- Don't check keys for equality when re-inserting due to growing the] + + 088995f Mon Nov 25 00:57:51 2013 -0800 + Merge pull request #77 from amplab/upgrade + [Sync with Spark master] + + 62889c4 Mon Nov 25 11:27:45 2013 +0800 + Merge pull request #203 from witgo/master + [ Fix Maven build for metrics-graphite] + + 6af03ed Sun Nov 24 16:42:37 2013 -0800 + Merge pull request #76 from dcrankshaw/fix_partitioners + [Actually use partitioner command line args in Analytics.] + + 859d62d Sun Nov 24 16:19:51 2013 -0800 + Merge pull request #151 from russellcardullo/add-graphite-sink + [Add graphite sink for metrics] + + 65de73c Sun Nov 24 15:52:33 2013 -0800 + Merge pull request #185 from mkolod/random-number-generator + [XORShift RNG with unit tests and benchmark] + + 972171b Mon Nov 25 07:50:46 2013 +0800 + Merge pull request #197 from aarondav/patrick-fix + [Fix 'timeWriting' stat for shuffle files] + + a1a7e36 Sun Nov 24 05:15:09 2013 -0800 + Merge pull request #75 from amplab/simplify + [Simplify GraphImpl internals] + + 718cc80 Sun Nov 24 11:02:02 2013 +0800 + Merge pull request #200 from mateiz/hash-fix + [AppendOnlyMap fixes] + + 51aa9d6 Sat Nov 23 19:46:46 2013 +0800 + Merge pull request #198 from ankurdave/zipPartitions-preservesPartitioning + [Support preservesPartitioning in RDD.zipPartitions] + + 18ce7e9 Fri Nov 22 17:02:40 2013 -0800 + Merge pull request #73 from jegonzal/TriangleCount + [Triangle count] + + 086b097 Fri Nov 22 10:26:39 2013 +0800 + Merge pull request #193 from aoiwelle/patch-1 + [Fix Kryo Serializer buffer documentation inconsistency] + + f20093c Fri Nov 22 10:12:13 2013 +0800 + Merge pull request #196 from pwendell/master + [TimeTrackingOutputStream should pass on calls to close() and flush().] 
+ + 4b89501 Wed Nov 20 10:36:10 2013 -0800 + Merge pull request #191 from hsaputra/removesemicolonscala + [Cleanup to remove semicolons (;) from Scala code] + + 202f8e6 Wed Nov 20 03:26:08 2013 -0800 + Merge pull request #74 from dcrankshaw/remove_sleep + [Removed sleep from pagerank in Analytics] + + 74ade9e Tue Nov 19 16:53:58 2013 -0800 + Merge pull request #62 from dcrankshaw/partitioners + [Allow user to choose a partitioner at runtime] + + f568912 Tue Nov 19 16:11:31 2013 -0800 + Merge pull request #181 from BlackNiuza/fix_tasks_number + [correct number of tasks in ExecutorsUI] + + aa638ed Tue Nov 19 16:05:44 2013 -0800 + Merge pull request #189 from tgravescs/sparkYarnErrorHandling + [Impove Spark on Yarn Error handling] + + 5592580 Tue Nov 19 16:04:01 2013 -0800 + Merge pull request #187 from aarondav/example-bcast-test + [Enable the Broadcast examples to work in a cluster setting] + + 99cfe89 Mon Nov 18 22:00:36 2013 -0500 + Updates to reflect pull request code review + [] + + e2ebc3a Sun Nov 17 18:42:18 2013 -0800 + Merge pull request #182 from rxin/vector + [Slightly enhanced PrimitiveVector:] + + 26f616d Sun Nov 17 18:18:16 2013 -0800 + Merge pull request #3 from aarondav/pv-test + [Add PrimitiveVectorSuite and fix bug in resize()] + + 1b5b358 Sat Nov 16 11:44:10 2013 -0800 + Merge pull request #178 from hsaputra/simplecleanupcode + [Simple cleanup on Spark's Scala code] + + 62a2a71 Fri Nov 15 13:12:07 2013 -0800 + Merge pull request #65 from amplab/varenc + [Use variable encoding for ints, longs, and doubles in the specialized serializers.] + + f6b2e59 Thu Nov 14 23:04:55 2013 -0800 + Merge pull request #1 from aarondav/scala210-master + [Various merge corrections] + + 96e0fb4 Thu Nov 14 22:29:28 2013 -0800 + Merge pull request #173 from kayousterhout/scheduler_hang + [Fix bug where scheduler could hang after task failure.] + + dfd40e9 Thu Nov 14 19:44:50 2013 -0800 + Merge pull request #175 from kayousterhout/no_retry_not_serializable + [Don't retry tasks when they fail due to a NotSerializableException] + + ed25105 Thu Nov 14 19:43:55 2013 -0800 + Merge pull request #174 from ahirreddy/master + [Write Spark UI url to driver file on HDFS] + + 1a4cfbe Thu Nov 14 10:32:11 2013 -0800 + Merge pull request #169 from kayousterhout/mesos_fix + [Don't ignore spark.cores.max when using Mesos Coarse mode] + + 5a4f483 Thu Nov 14 10:30:36 2013 -0800 + Merge pull request #170 from liancheng/hadooprdd-doc-typo + [Fixed a scaladoc typo in HadoopRDD.scala] + + d76f520 Thu Nov 14 10:25:48 2013 -0800 + Merge pull request #171 from RIA-pierre-borckmans/master + [Fixed typos in the CDH4 distributions version codes.] 
+ + 2c39d80 Wed Nov 13 23:28:01 2013 -0800 + Merge pull request #69 from jegonzal/MissingVertices + [Addressing issue in Graph creation] + + 33b2dea Wed Nov 13 17:55:58 2013 -0800 + Merge pull request #1 from ankurdave/MissingVertices + [During graph creation, create eTable earlier] + + 2054c61 Wed Nov 13 16:49:55 2013 -0800 + Merge pull request #159 from liancheng/dagscheduler-actor-refine + [Migrate the daemon thread started by DAGScheduler to Akka actor] + + 9290e5b Wed Nov 13 16:48:44 2013 -0800 + Merge pull request #165 from NathanHowell/kerberos-master + [spark-assembly.jar fails to authenticate with YARN ResourceManager] + + a81fcb7 Wed Nov 13 10:41:01 2013 -0800 + Merge pull request #68 from jegonzal/BitSetSetUntilBug + [Addressing bug in BitSet.setUntil(ind)] + + 39af914 Wed Nov 13 08:39:05 2013 -0800 + Merge pull request #166 from ahirreddy/simr-spark-ui + [SIMR Backend Scheduler will now write Spark UI URL to HDFS, which is to ...] + + f49ea28 Tue Nov 12 19:13:39 2013 -0800 + Merge pull request #137 from tgravescs/sparkYarnJarsHdfsRebase + [Allow spark on yarn to be run from HDFS.] + + 87f2f4e Tue Nov 12 16:26:09 2013 -0800 + Merge pull request #153 from ankurdave/stop-spot-cluster + [Enable stopping and starting a spot cluster] + + b8bf04a Tue Nov 12 16:19:50 2013 -0800 + Merge pull request #160 from xiajunluan/JIRA-923 + [Fix bug JIRA-923] + + dfd1ebc Tue Nov 12 09:10:05 2013 -0800 + Merge pull request #164 from tdas/kafka-fix + [Made block generator thread safe to fix Kafka bug.] + + 2e8d450 Mon Nov 11 17:34:09 2013 -0800 + Merge pull request #63 from jegonzal/VertexSetCleanup + [Cleanup of VertexSetRDD] + + b8e294a Mon Nov 11 16:25:42 2013 -0800 + Merge pull request #61 from ankurdave/pid2vid + [Shuffle replicated vertex attributes efficiently in columnar format] + + 3d7277c Mon Nov 11 15:49:28 2013 -0800 + Merge pull request #55 from ankurdave/aggregateNeighbors-variants + [Specialize mapReduceTriplets for accessing subsets of vertex attributes] + + 23b53ef Mon Nov 11 12:30:02 2013 -0800 + Merge pull request #156 from haoyuan/master + [add tachyon module] + + 1a06f70 Sun Nov 10 10:54:44 2013 -0800 + Merge pull request #60 from amplab/rxin + [Looks good to me.] + + 58d4f6c Sun Nov 10 09:23:56 2013 -0800 + Merge pull request #157 from rxin/kryo + [3 Kryo related changes.] + + 3efc019 Sat Nov 9 17:53:49 2013 -0800 + Merge pull request #147 from JoshRosen/fix-java-api-completeness-checker + [Add spark-tools assembly to spark-class'ss classpath] + + 87954d4 Sat Nov 9 17:53:25 2013 -0800 + Merge pull request #154 from soulmachine/ClusterScheduler + [Replace the thread inside ClusterScheduler.start() with an Akka scheduler] + + f6c9462 Sat Nov 9 16:14:45 2013 -0800 + Merge pull request #58 from jegonzal/KryoMessages + [Kryo messages] + + 83bf192 Sat Nov 9 15:40:29 2013 -0800 + Merge pull request #155 from rxin/jobgroup + [Don't reset job group when a new job description is set.] + + 8af99f2 Sat Nov 9 13:48:00 2013 -0800 + Merge pull request #149 from tgravescs/fixSecureHdfsAccess + [Fix secure hdfs access for spark on yarn] + + 72a601e Sat Nov 9 11:55:16 2013 -0800 + Merge pull request #152 from rxin/repl + [Propagate SparkContext local properties from spark-repl caller thread to the repl execution thread.] 
+ + 6ee05be Thu Nov 7 19:12:41 2013 -0800 + Merge pull request #49 from jegonzal/graphxshell + [GraphX Console with Logo Text] + + a9f96b5 Thu Nov 7 18:56:56 2013 -0800 + Merge pull request #56 from jegonzal/PregelAPIChanges + [Changing Pregel API to use mapReduceTriplets instead of aggregateNeighbors] + + 5907137 Thu Nov 7 16:58:31 2013 -0800 + Merge pull request #54 from amplab/rxin + [Converted for loops to while loops in EdgePartition.] + + edf4164 Thu Nov 7 16:22:43 2013 -0800 + Merge pull request #53 from amplab/rxin + [Added GraphX to classpath.] + + c379e10 Thu Nov 7 16:01:47 2013 -0800 + Merge pull request #51 from jegonzal/VertexSetRDD + [Reverting to Array based (materialized) output in VertexSetRDD] + + 3d4ad84 Thu Nov 7 11:08:27 2013 -0800 + Merge pull request #148 from squito/include_appId + [Include appId in executor cmd line args] + + be7e8da Wed Nov 6 23:22:47 2013 -0800 + Merge pull request #23 from jerryshao/multi-user + [Add Spark multi-user support for standalone mode and Mesos] + + aadeda5 Wed Nov 6 13:27:47 2013 -0800 + Merge pull request #144 from liancheng/runjob-clean + [Removed unused return value in SparkContext.runJob] + + 951024f Wed Nov 6 09:36:14 2013 -0800 + Merge pull request #145 from aarondav/sls-fix + [Attempt to fix SparkListenerSuite breakage] + + bf4e613 Tue Nov 5 23:14:09 2013 -0800 + Merge pull request #143 from rxin/scheduler-hang + [Ignore a task update status if the executor doesn't exist anymore.] + + 9f7b9bb Tue Nov 5 10:42:19 2013 -0800 + Merge pull request #142 from liancheng/dagscheduler-pattern-matching + [Using case class deep match to simplify code in DAGScheduler.processEvent] + + ca44b51 Tue Nov 5 01:32:55 2013 -0800 + Merge pull request #50 from amplab/mergemerge + [Merge Spark master into graphx] + + 8106532 Mon Nov 4 20:47:14 2013 -0800 + Merge pull request #139 from aarondav/shuffle-next + [Never store shuffle blocks in BlockManager] + + 0b26a39 Mon Nov 4 18:22:06 2013 -0800 + Merge pull request #128 from shimingfei/joblogger-doc + [add javadoc to JobLogger, and some small fix] + + 7a26104 Mon Nov 4 17:54:06 2013 -0800 + Merge pull request #130 from aarondav/shuffle + [Memory-optimized shuffle file consolidation] + + b5dc339 Sun Nov 3 20:43:15 2013 -0800 + Merge pull request #70 from rxin/hash1 + [Fast, memory-efficient hash set, hash table implementations optimized for primitive data types.] + + 41ead7a Sat Nov 2 14:41:50 2013 -0700 + Merge pull request #133 from Mistobaan/link_fix + [update default github] + + d407c07 Sat Nov 2 14:36:37 2013 -0700 + Merge pull request #134 from rxin/readme + [Fixed a typo in Hadoop version in README.] 
+ + e7c7b80 Fri Nov 1 17:58:10 2013 -0700 + Merge pull request #132 from Mistobaan/doc_fix + [fix persistent-hdfs] + + d6d11c2 Fri Nov 1 15:40:33 2013 -0700 + Merge pull request #129 from velvia/2013-11/document-local-uris + [Document & finish support for local: URIs] + + 99bfcc9 Thu Oct 31 21:38:10 2013 -0700 + Merge pull request #46 from jegonzal/VertexSetWithHashSet + [Switched VertexSetRDD and GraphImpl to use OpenHashSet] + + fcaaf86 Thu Oct 31 18:27:30 2013 -0700 + Merge pull request #44 from jegonzal/rxinBitSet + [Switching to VertexSetRDD to use @rxin BitSet and OpenHash ] + + 3f3c727 Thu Oct 31 09:52:25 2013 -0700 + Merge pull request #41 from jegonzal/LineageTracking + [Optimizing Graph Lineage] + + 944f6b8 Thu Oct 31 09:40:35 2013 -0700 + Merge pull request #43 from amplab/FixBitSetCastException + [Fix BitSet cast exception] + + 8f1098a Wed Oct 30 20:11:48 2013 -0700 + Merge pull request #117 from stephenh/avoid_concurrent_modification_exception + [Handle ConcurrentModificationExceptions in SparkContext init.] + + dc9ce16 Wed Oct 30 17:01:56 2013 -0700 + Merge pull request #126 from kayousterhout/local_fix + [Fixed incorrect log message in local scheduler] + + 33de11c Wed Oct 30 16:58:27 2013 -0700 + Merge pull request #124 from tgravescs/sparkHadoopUtilFix + [Pull SparkHadoopUtil out of SparkEnv (jira SPARK-886)] + + a0c86c3 Wed Oct 30 15:34:39 2013 -0700 + Merge pull request #38 from jegonzal/Documentation + [Improving Documentation] + + 618c1f6 Wed Oct 30 12:03:44 2013 -0700 + Merge pull request #125 from velvia/2013-10/local-jar-uri + [Add support for local:// URI scheme for addJars()] + + 745dc42 Tue Oct 29 23:47:10 2013 -0700 + Merge pull request #118 from JoshRosen/blockinfo-memory-usage + [Reduce the memory footprint of BlockInfo objects] + + 06adf63 Tue Oct 29 16:43:46 2013 -0700 + Merge pull request #33 from kellrott/master + [Fixing graph/pom.xml] + + 098768e Tue Oct 29 15:08:36 2013 -0700 + Merge pull request #37 from jegonzal/AnalyticsCleanup + [Updated Connected Components and Pregel Docs] + + f0e23a0 Tue Oct 29 01:41:44 2013 -0400 + Merge pull request #119 from soulmachine/master + [A little revise for the document] + + aec9bf9 Sun Oct 27 19:32:00 2013 -0700 + Merge pull request #112 from kayousterhout/ui_task_attempt_id + [Display both task ID and task attempt ID in UI, and rename taskId to taskAttemptId] + + d4df474 Sun Oct 27 22:11:21 2013 -0400 + Merge pull request #115 from aarondav/shuffle-fix + [Eliminate extra memory usage when shuffle file consolidation is disabled] + + e018f2d Sat Oct 26 11:39:15 2013 -0700 + Merge pull request #113 from pwendell/master + [Improve error message when multiple assembly jars are present.] + + 662ee9f Sat Oct 26 11:35:59 2013 -0700 + Merge pull request #114 from soulmachine/master + [A little revise for the document] + + bab496c Fri Oct 25 18:28:43 2013 -0700 + Merge pull request #108 from alig/master + [Changes to enable executing by using HDFS as a synchronization point between driver and executors, as well as ensuring executors exit properly.] + + d307db6 Fri Oct 25 17:26:06 2013 -0700 + Merge pull request #102 from tdas/transform + [Added new Spark Streaming operations] + + 85e2cab Fri Oct 25 14:46:06 2013 -0700 + Merge pull request #111 from kayousterhout/ui_name + [Properly display the name of a stage in the UI.] + + ab35ec4 Fri Oct 25 10:16:18 2013 -0700 + Merge pull request #110 from pwendell/master + [Exclude jopt from kafka dependency.] 
+ + 4f2c943 Thu Oct 24 22:32:02 2013 -0700 + Merge pull request #109 from pwendell/master + [Adding Java/Java Streaming versions of `repartition` with associated tests] + + 99ad4a6 Thu Oct 24 17:08:39 2013 -0700 + Merge pull request #106 from pwendell/master + [Add a `repartition` operator.] + + 5429d62 Thu Oct 24 11:15:55 2013 -0700 + Merge pull request #107 from ScrapCodes/scala-2.10 + [Updating to latest akka 2.2.3, which fixes our only failing test Driver Suite.] + + 6f82c42 Thu Oct 24 11:09:46 2013 -0700 + Merge pull request #34 from jegonzal/AnalyticsCleanup + [Analytics Cleanup] + + 1dc776b Wed Oct 23 22:05:52 2013 -0700 + Merge pull request #93 from kayousterhout/ui_new_state + [Show "GETTING_RESULTS" state in UI.] + + c4b187d Wed Oct 23 21:56:18 2013 -0700 + Merge pull request #105 from pwendell/doc-fix + [Fixing broken links in programming guide] + + a098438 Wed Oct 23 18:03:08 2013 -0700 + Merge pull request #103 from JoshRosen/unpersist-fix + [Add unpersist() to JavaDoubleRDD and JavaPairRDD.] + + dd65964 Wed Oct 23 15:07:59 2013 -0700 + Merge pull request #64 from prabeesh/master + [MQTT Adapter for Spark Streaming] + + 452aa36 Tue Oct 22 23:15:33 2013 -0700 + Merge pull request #97 from ewencp/pyspark-system-properties + [Add classmethod to SparkContext to set system properties.] + + 9dfcf53 Tue Oct 22 16:01:42 2013 -0700 + Merge pull request #100 from JoshRosen/spark-902 + [Remove redundant Java Function call() definitions] + + 49d5cda Tue Oct 22 15:38:02 2013 -0700 + Merge pull request #30 from jegonzal/VertexSetRDD_Tests + [Testing and Documenting VertexSetRDD] + + 97184de Tue Oct 22 13:10:14 2013 -0700 + Merge pull request #99 from pwendell/master + [Use correct formatting for comments in StoragePerfTester] + + c404adb Tue Oct 22 11:30:19 2013 -0700 + Merge pull request #90 from pwendell/master + [SPARK-940: Do not directly pass Stage objects to SparkListener.] + + aa9019f Tue Oct 22 10:30:02 2013 -0700 + Merge pull request #98 from aarondav/docs + [Docs: Fix links to RDD API documentation] + + a0e08f0 Tue Oct 22 10:20:43 2013 -0700 + Merge pull request #82 from JoshRosen/map-output-tracker-refactoring + [Split MapOutputTracker into Master/Worker classes] + + b84193c Mon Oct 21 23:35:13 2013 -0700 + Merge pull request #92 from tgravescs/sparkYarnFixClasspath + [Fix the Worker to use CoarseGrainedExecutorBackend and modify classpath ...] + + 731c94e Mon Oct 21 23:31:38 2013 -0700 + Merge pull request #56 from jerryshao/kafka-0.8-dev + [Upgrade Kafka 0.7.2 to Kafka 0.8.0-beta1 for Spark Streaming] + + 48952d6 Mon Oct 21 22:45:00 2013 -0700 + Merge pull request #87 from aarondav/shuffle-base + [Basic shuffle file consolidation] + + a51359c Mon Oct 21 20:33:29 2013 -0700 + Merge pull request #95 from aarondav/perftest + [Minor: Put StoragePerfTester in org/apache/] + + 39d2e9b Mon Oct 21 18:58:48 2013 -0700 + Merge pull request #94 from aarondav/mesos-fix + [Fix mesos urls] + + aa61bfd Mon Oct 21 11:57:05 2013 -0700 + Merge pull request #88 from rxin/clean + [Made the following traits/interfaces/classes non-public:] + + 35886f3 Sun Oct 20 22:20:32 2013 -0700 + Merge pull request #41 from pwendell/shuffle-benchmark + [Provide Instrumentation for Shuffle Write Performance] + + 5b9380e Sun Oct 20 21:03:51 2013 -0700 + Merge pull request #89 from rxin/executor + [Don't setup the uncaught exception handler in local mode.] + + 261bcf2 Sun Oct 20 17:59:51 2013 -0700 + Merge pull request #80 from rxin/build + [Exclusion rules for Maven build files.] 
+ + edc5e3f Sun Oct 20 17:18:06 2013 -0700 + Merge pull request #75 from JoshRosen/block-manager-cleanup + [Code de-duplication in BlockManager] + + 2a7ae17 Sun Oct 20 11:45:21 2013 -0700 + Merge pull request #84 from rxin/kill1 + [Added documentation for setJobGroup. Also some minor cleanup in SparkContext.] + + e4abb75 Sun Oct 20 09:38:37 2013 -0700 + Merge pull request #85 from rxin/clean + [Moved the top level spark package object from spark to org.apache.spark] + + 136b9b3 Sun Oct 20 02:58:26 2013 -0700 + Basic shuffle file consolidation + [The Spark shuffle phase can produce a large number of files, as one file is created] + + 747f538 Sat Oct 19 23:40:40 2013 -0700 + Merge pull request #83 from ewencp/pyspark-accumulator-add-method + [Add an add() method to pyspark accumulators.] + + 6511bbe Sat Oct 19 11:34:56 2013 -0700 + Merge pull request #78 from mosharaf/master + [Removed BitTorrentBroadcast and TreeBroadcast.] + + f628804 Fri Oct 18 23:19:42 2013 -0700 + Merge pull request #76 from pwendell/master + [Clarify compression property.] + + 599dcb0 Fri Oct 18 22:49:00 2013 -0700 + Merge pull request #74 from rxin/kill + [Job cancellation via job group id.] + + 9cf43cf Fri Oct 18 22:07:21 2013 -0700 + Merge pull request #28 from jegonzal/VertexSetRDD + [Refactoring IndexedRDD to VertexSetRDD.] + + f888a5b Fri Oct 18 22:06:58 2013 -0700 + Merge pull request #29 from ankurdave/unit-tests + [Unit tests for Graph and GraphOps] + + 8de9706 Fri Oct 18 20:32:39 2013 -0700 + Merge pull request #66 from shivaram/sbt-assembly-deps + [Add SBT target to assemble dependencies] + + e5316d0 Fri Oct 18 20:30:56 2013 -0700 + Merge pull request #68 from mosharaf/master + [Faster and stable/reliable broadcast] + + 8d528af Fri Oct 18 20:24:10 2013 -0700 + Merge pull request #71 from aarondav/scdefaults + [Spark shell exits if it cannot create SparkContext] + + 0794bd7 Fri Oct 18 18:59:58 2013 -0700 + Merge pull request #27 from jegonzal/removed_indexedrdd_from_core + [Removing IndexedRDD changes for spark/core] + + 099977f Thu Oct 17 14:17:08 2013 -0700 + Merge pull request #26 from ankurdave/split-vTableReplicated + [Great work!] + + fc26e5b Thu Oct 17 13:21:07 2013 -0700 + Merge pull request #69 from KarthikTunga/master + [Fix for issue SPARK-627. Implementing --config argument in the scripts.] + + cf64f63 Thu Oct 17 11:12:28 2013 -0700 + Merge pull request #67 from kayousterhout/remove_tsl + [Removed TaskSchedulerListener interface.] + + f9973ca Wed Oct 16 15:58:41 2013 -0700 + Merge pull request #65 from tgravescs/fixYarn + [Fix yarn build] + + 28e9c2a Tue Oct 15 23:59:56 2013 -0700 + Merge pull request #63 from pwendell/master + [Fixing spark streaming example and a bug in examples build.] + + 4e46fde Tue Oct 15 23:14:27 2013 -0700 + Merge pull request #62 from harveyfeng/master + [Make TaskContext's stageId publicly accessible.] + + b534606 Tue Oct 15 21:25:03 2013 -0700 + Merge pull request #8 from vchekan/checkpoint-ttl-restore + [Serialize and restore spark.cleaner.ttl to savepoint] + + 6dbd220 Tue Oct 15 19:02:57 2013 -0700 + Merge pull request #34 from kayousterhout/rename + [Renamed StandaloneX to CoarseGrainedX.] + + 983b83f Tue Oct 15 19:02:46 2013 -0700 + Merge pull request #61 from kayousterhout/daemon_thread + [Unified daemon thread pools] + + 3249e0e Tue Oct 15 14:12:33 2013 -0700 + Merge pull request #59 from rxin/warning + [Bump up logging level to warning for failed tasks.] 
+ + 678dec6 Tue Oct 15 10:51:46 2013 -0700 + Merge pull request #58 from hsaputra/update-pom-asf + [Update pom.xml to use version 13 of the ASF parent pom] + + e33b183 Mon Oct 14 22:25:47 2013 -0700 + Merge pull request #29 from rxin/kill + [Job killing] + + 3b11f43 Mon Oct 14 14:20:01 2013 -0700 + Merge pull request #57 from aarondav/bid + [Refactor BlockId into an actual type] + + 9979690 Sat Oct 12 21:23:26 2013 -0700 + Merge pull request #52 from harveyfeng/hadoop-closure + [Add an optional closure parameter to HadoopRDD instantiation to use when creating local JobConfs.] + + dca8009 Fri Oct 11 16:08:15 2013 -0700 + Merge pull request #54 from aoiwelle/remove_unused_imports + [Remove unnecessary mutable imports] + + 0e5052b Fri Oct 11 15:45:16 2013 -0700 + Merge pull request #51 from ScrapCodes/scala-2.10 + [Scala 2.10] + + fb25f32 Fri Oct 11 15:44:43 2013 -0700 + Merge pull request #53 from witgo/master + [Add a zookeeper compile dependency to fix build in maven] + + d6ead47 Fri Oct 11 15:43:01 2013 -0700 + Merge pull request #32 from mridulm/master + [Address review comments, move to incubator spark] + + c71499b Thu Oct 10 17:16:42 2013 -0700 + Merge pull request #19 from aarondav/master-zk + [Standalone Scheduler fault tolerance using ZooKeeper] + + 5867a82 Thu Oct 10 14:02:37 2013 -0700 + Merge pull request #19 from dcrankshaw/master + [Merge canonical 2d partitioner and group edges into benchmarks] + + cd08f73 Thu Oct 10 13:55:47 2013 -0700 + Merge pull request #44 from mateiz/fast-map + [A fast and low-memory append-only map for shuffle operations] + + 4b46d51 Thu Oct 10 13:35:36 2013 -0700 + Merge pull request #17 from amplab/product2 + [product 2 change] + + 320418f Wed Oct 9 16:55:30 2013 -0700 + Merge pull request #49 from mateiz/kryo-fix-2 + [Fix Chill serialization of Range objects] + + 215238c Wed Oct 9 16:49:44 2013 -0700 + Merge pull request #50 from kayousterhout/SPARK-908 + [Fix race condition in SparkListenerSuite (fixes SPARK-908).] 
+ + 7827efc Wed Oct 9 15:07:25 2013 -0700 + Merge pull request #46 from mateiz/py-sort-update + [Fix PySpark docs and an overly long line of code after #38] + + 7b3ae04 Wed Oct 9 12:14:19 2013 -0700 + Merge pull request #45 from pwendell/metrics_units + [Use standard abbreviation in metrics description (MBytes -> MB)] + + b4fa11f Wed Oct 9 11:59:47 2013 -0700 + Merge pull request #38 from AndreSchumacher/pyspark_sorting + [SPARK-705: implement sortByKey() in PySpark] + + 19d445d Wed Oct 9 11:08:34 2013 -0700 + Merge pull request #22 from GraceH/metrics-naming + [SPARK-900 Use coarser grained naming for metrics] + + 7d50f9f Wed Oct 9 10:32:42 2013 -0700 + Merge pull request #35 from MartinWeindel/scala-2.10 + [Fixing inconsistencies and warnings on Scala 2.10 branch] + + 3218fa7 Tue Oct 8 23:44:55 2013 -0700 + Merge pull request #4 from MLnick/implicit-als + [Adding algorithm for implicit feedback data to ALS] + + e67d5b9 Tue Oct 8 22:57:38 2013 -0700 + Merge pull request #43 from mateiz/kryo-fix + [Don't allocate Kryo buffers unless needed] + + ea34c52 Mon Oct 7 20:45:58 2013 -0700 + Merge pull request #42 from pwendell/shuffle-read-perf + [Fix inconsistent and incorrect log messages in shuffle read path] + + 02f37ee Mon Oct 7 15:48:52 2013 -0700 + Merge pull request #39 from pwendell/master + [Adding Shark 0.7.1 to EC2 scripts] + + 213b70a Mon Oct 7 10:54:22 2013 -0700 + Merge pull request #31 from sundeepn/branch-0.8 + [Resolving package conflicts with hadoop 0.23.9] + + d585613 Sat Oct 5 22:57:05 2013 -0700 + Merge pull request #37 from pwendell/merge-0.8 + [merge in remaining changes from `branch-0.8`] + + 4a25b11 Sat Oct 5 19:28:55 2013 -0700 + Merge pull request #20 from harveyfeng/hadoop-config-cache + [Allow users to pass broadcasted Configurations and cache InputFormats across Hadoop file reads.] + + 8fc68d0 Sat Oct 5 17:24:35 2013 -0700 + Merge pull request #36 from pwendell/versions + [Bumping EC2 default version in master to .] + + 100222b Sat Oct 5 13:38:59 2013 -0700 + Merge pull request #27 from davidmccauley/master + [SPARK-920/921 - JSON endpoint updates] + + 0864193 Sat Oct 5 13:25:18 2013 -0700 + Merge pull request #33 from AndreSchumacher/pyspark_partition_key_change + [Fixing SPARK-602: PythonPartitioner] + + 61ffcde Fri Oct 4 10:52:17 2013 -0700 + Merge pull request #15 from dcrankshaw/master + [Add synthetic generators] + + 3fe12cc Fri Oct 4 10:51:28 2013 -0700 + Merge pull request #946 from ScrapCodes/scala-2.10 + [Fixed non termination of Executor backend, when sc.stop is not called and system.exit instead.] + + 232765f Thu Oct 3 12:00:48 2013 -0700 + Merge pull request #26 from Du-Li/master + [fixed a wildcard bug in make-distribution.sh; ask sbt to check local] + + 405e69b Thu Oct 3 10:52:41 2013 -0700 + Merge pull request #25 from CruncherBigData/master + [Update README: updated the link] + + 49dbfcc Thu Oct 3 10:52:06 2013 -0700 + Merge pull request #28 from tgravescs/sparYarnAppName + [Allow users to set the application name for Spark on Yarn] + + e597ea3 Wed Oct 2 21:14:24 2013 -0700 + Merge pull request #10 from kayousterhout/results_through-bm + [Send Task results through the block manager when larger than Akka frame size (fixes SPARK-669).] 
+ + 714fdab Thu Sep 26 14:28:55 2013 -0700 + Merge pull request #17 from rxin/optimize + [Remove -optimize flag] + + 13eced7 Thu Sep 26 14:18:19 2013 -0700 + Merge pull request #16 from pwendell/master + [Bug fix in master build] + + 70a0b99 Thu Sep 26 14:11:54 2013 -0700 + Merge pull request #14 from kayousterhout/untangle_scheduler + [Improved organization of scheduling packages.] + + afd03b2 Thu Sep 26 14:09:55 2013 -0700 + Merge pull request #943 from ScrapCodes/scala-2.10 + [Scala 2.10 with akka 2.2] + + 76677b8 Thu Sep 26 14:03:46 2013 -0700 + Merge pull request #670 from jey/ec2-ssh-improvements + [EC2 SSH improvements] + + c514cd1 Thu Sep 26 13:48:20 2013 -0700 + Merge pull request #930 from holdenk/master + [Add mapPartitionsWithIndex] + + 560ee5c Thu Sep 26 11:27:34 2013 -0700 + Merge pull request #7 from wannabeast/memorystore-fixes + [some minor fixes to MemoryStore] + + 6566a19 Thu Sep 26 08:01:04 2013 -0700 + Merge pull request #9 from rxin/limit + [Smarter take/limit implementation.] + + 834686b Sun Sep 22 15:06:48 2013 -0700 + Merge pull request #928 from jerryshao/fairscheduler-refactor + [Refactor FairSchedulableBuilder] + + a2ea069 Sat Sep 21 23:04:42 2013 -0700 + Merge pull request #937 from jerryshao/localProperties-fix + [Fix PR926 local properties issues in Spark Streaming like scenarios] + + f06f2da Sat Sep 21 22:43:34 2013 -0700 + Merge pull request #941 from ilikerps/master + [Add "org.apache." prefix to packages in spark-class] + + 7bb12a2 Sat Sep 21 22:42:46 2013 -0700 + Merge pull request #940 from ankurdave/clear-port-properties-after-tests + [After unit tests, clear port properties unconditionally] + + a00317b Fri Sep 20 11:29:31 2013 -0700 + Merge pull request #1 from ankurdave/aggregateNeighbors-returns-graph + [Return Graph from Graph.aggregateNeighbors] + + 6a5e665 Thu Sep 19 22:41:44 2013 -0700 + Merge pull request #3 from ankurdave/clear-port-properties-after-tests + [After unit tests, clear port properties unconditionally ] + + 68ad33a Thu Sep 19 21:30:27 2013 -0700 + Merge pull request #2 from ankurdave/package-fixes + [Package fixes (spark.graph -> org.apache.spark.graph)] + + cd7222c Thu Sep 19 14:21:24 2013 -0700 + Merge pull request #938 from ilikerps/master + [Fix issue with spark_ec2 seeing empty security groups] + + e0dd24d Sat Aug 31 17:54:15 2013 -0700 + Merge pull request #879 from AndreSchumacher/scala-2.10 + [PySpark: replacing class manifest by class tag for Scala 2.10.2 in rdd.py] + + ad61349 Thu Jul 18 13:53:48 2013 -0700 + Merge pull request #709 from ScrapCodes/scala-2.10 + [Fixed warnings in scala 2.10 branch.] 
+ + a289ded Mon Jul 15 15:59:43 2013 -0700 + Merge pull request #700 from ScrapCodes/scala-2.10 + [Scala 2.10 ] + + 1044a95 Fri Jun 14 20:04:24 2013 -0700 + Merge pull request #652 from ScrapCodes/scala-2.10 + [Fixed maven build without netty fix] + + 4b57f83 Sat Apr 20 10:40:07 2013 -0700 + Merge pull request #535 from ScrapCodes/scala-2.10-repl-port + [porting of repl to scala-2.10] + + 73b3fee Sun Jan 20 10:11:49 2013 -0800 + Merge pull request #388 from folone/master + [Updated maven build configuration for Scala 2.10] + + 20adf27 Tue Jan 15 11:03:49 2013 -0800 + Merge pull request #371 from folone/master + [Scala 2.10.0] + +Release 0.8.0-incubating + + 2aff798 Sun Sep 15 14:05:04 2013 -0700 + Merge pull request #933 from jey/yarn-typo-fix + [Fix typo in Maven build docs] + + dbd2c4f Sun Sep 15 13:20:41 2013 -0700 + Merge pull request #932 from pwendell/mesos-version + [Bumping Mesos version to 0.13.0] + + 9fb0b9d Sun Sep 15 13:02:53 2013 -0700 + Merge pull request #931 from pwendell/yarn-docs + [Explain yarn.version in Maven build docs] + + c4c1db2 Fri Sep 13 19:52:12 2013 -0700 + Merge pull request #929 from pwendell/master + [Use different Hadoop version for YARN artifacts.] + + a310de6 Wed Sep 11 19:36:11 2013 -0700 + Merge pull request #926 from kayousterhout/dynamic + [Changed localProperties to use ThreadLocal (not DynamicVariable).] + + 58c7d8b Wed Sep 11 17:33:42 2013 -0700 + Merge pull request #927 from benh/mesos-docs + [Updated Spark on Mesos documentation.] + + 91a59e6 Wed Sep 11 10:21:48 2013 -0700 + Merge pull request #919 from mateiz/jets3t + [Add explicit jets3t dependency, which is excluded in hadoop-client] + + b9128d3 Wed Sep 11 10:03:06 2013 -0700 + Merge pull request #922 from pwendell/port-change + [Change default port number from 3030 to 4030.] + + e07eef8 Wed Sep 11 07:35:39 2013 -0700 + Merge pull request #925 from davidmccauley/master + [SPARK-894 - Not all WebUI fields delivered VIA JSON] + + 8432f27 Tue Sep 10 23:19:53 2013 -0700 + Merge pull request #923 from haoyuan/master + [fix run-example script] + + d40f140 Tue Sep 10 23:05:29 2013 -0700 + Merge pull request #921 from pwendell/master + [Fix HDFS access bug with assembly build.] + + 0a6c051 Mon Sep 9 23:37:57 2013 -0700 + Merge pull request #918 from pwendell/branch-0.8 + [Update versions for 0.8.0 release.] + + 8c14f4b Mon Sep 9 22:07:58 2013 -0700 + Merge pull request #917 from pwendell/master + [Document libgfortran dependency for MLBase] + + c81377b Mon Sep 9 20:16:19 2013 -0700 + Merge pull request #915 from ooyala/master + [Get rid of / improve ugly NPE when Utils.deleteRecursively() fails] + + 61d2a01 Mon Sep 9 18:21:01 2013 -0700 + Merge pull request #916 from mateiz/mkdist-fix + [Fix copy issue in https://github.com/mesos/spark/pull/899] + + a85758c Mon Sep 9 13:45:40 2013 -0700 + Merge pull request #907 from stephenh/document_coalesce_shuffle + [Add better docs for coalesce.] 
+ + 084fc36 Mon Sep 9 12:01:35 2013 -0700 + Merge pull request #912 from tgravescs/ganglia-pom + [Add metrics-ganglia to core pom file] + + 0456384 Mon Sep 9 09:57:54 2013 -0700 + Merge pull request #911 from pwendell/ganglia-sink + [Adding Manen dependency for Ganglia] + + bf984e2 Sun Sep 8 23:50:24 2013 -0700 + Merge pull request #890 from mridulm/master + [Fix hash bug] + + e9d4f44 Sun Sep 8 23:36:48 2013 -0700 + Merge pull request #909 from mateiz/exec-id-fix + [Fix an instance where full standalone mode executor IDs were passed to] + + 2447b1c Sun Sep 8 22:27:49 2013 -0700 + Merge pull request #910 from mateiz/ml-doc-tweaks + [Small tweaks to MLlib docs] + + 7d3204b Sun Sep 8 21:39:12 2013 -0700 + Merge pull request #905 from mateiz/docs2 + [Job scheduling and cluster mode docs] + + f1f8371 Sun Sep 8 21:26:11 2013 -0700 + Merge pull request #896 from atalwalkar/master + [updated content] + + f68848d Sun Sep 8 18:32:16 2013 -0700 + Merge pull request #906 from pwendell/ganglia-sink + [Clean-up of Metrics Code/Docs and Add Ganglia Sink] + + 0b95799 Sun Sep 8 15:30:16 2013 -0700 + Merge pull request #908 from pwendell/master + [Fix target JVM version in scala build] + + 04cfb3a Sun Sep 8 10:33:20 2013 -0700 + Merge pull request #898 from ilikerps/660 + [SPARK-660: Add StorageLevel support in Python] + + 38488ac Sun Sep 8 00:28:53 2013 -0700 + Merge pull request #900 from pwendell/cdh-docs + [Provide docs to describe running on CDH/HDP cluster.] + + a8e376e Sat Sep 7 21:16:01 2013 -0700 + Merge pull request #904 from pwendell/master + [Adding Apache license to two files] + + cfde85e Sat Sep 7 13:53:08 2013 -0700 + Merge pull request #901 from ooyala/2013-09/0.8-doc-changes + [0.8 Doc changes for make-distribution.sh] + + 4a7813a Sat Sep 7 13:52:24 2013 -0700 + Merge pull request #903 from rxin/resulttask + [Fixed the bug that ResultTask was not properly deserializing outputId.] + + afe46ba Sat Sep 7 07:28:51 2013 -0700 + Merge pull request #892 from jey/fix-yarn-assembly + [YARN build fixes] + + 2eebeff Fri Sep 6 15:25:22 2013 -0700 + Merge pull request #897 from pwendell/master + [Docs describing Spark monitoring and instrumentation] + + ddcb9d3 Thu Sep 5 23:54:09 2013 -0700 + Merge pull request #895 from ilikerps/821 + [SPARK-821: Don't cache results when action run locally on driver] + + 699c331 Thu Sep 5 20:21:53 2013 -0700 + Merge pull request #891 from xiajunluan/SPARK-864 + [[SPARK-864]DAGScheduler Exception if we delete Worker and StandaloneExecutorBackend then add Worker] + + 5c7494d Wed Sep 4 22:47:03 2013 -0700 + Merge pull request #893 from ilikerps/master + [SPARK-884: Add unit test to validate Spark JSON output] + + a547866 Wed Sep 4 21:11:56 2013 -0700 + Merge pull request #894 from c0s/master + [Updating assembly README to reflect recent changes in the build.] 
+ + 19f7027 Tue Sep 3 14:29:10 2013 -0700 + Merge pull request #878 from tgravescs/yarnUILink + [Link the Spark UI up to the Yarn UI ] + + 68df246 Tue Sep 3 13:01:17 2013 -0700 + Merge pull request #889 from alig/master + [Return the port the WebUI is bound to (useful if port 0 was used)] + + d3dd48f Mon Sep 2 16:44:54 2013 -0700 + Merge pull request #887 from mateiz/misc-fixes + [Miscellaneous fixes for 0.8] + + 636fc0c Mon Sep 2 11:20:39 2013 -0700 + Merge pull request #886 from mateiz/codec + [Fix spark.io.compression.codec and change default codec to LZF] + + d9a53b9 Sun Sep 1 22:12:30 2013 -0700 + Merge pull request #885 from mateiz/win-py + [Allow PySpark to run on Windows] + + 3c520fe Sun Sep 1 17:26:55 2013 -0700 + Merge pull request #884 from mateiz/win-fixes + [Run script fixes for Windows after package & assembly change] + + f957c26 Sun Sep 1 14:53:57 2013 -0700 + Merge pull request #882 from mateiz/package-rename + [Rename spark package to org.apache.spark] + + a30fac1 Sun Sep 1 12:27:50 2013 -0700 + Merge pull request #883 from alig/master + [Don't require the spark home environment variable to be set for standalone mode (change needed by SIMR)] + + 03cc765 Sun Sep 1 10:20:56 2013 -0700 + Merge pull request #881 from pwendell/master + [Extend QuickStart to include next steps] + + 0e9565a Sat Aug 31 18:55:41 2013 -0700 + Merge pull request #880 from mateiz/ui-tweaks + [Various UI tweaks] + + 2b29a1d Sat Aug 31 17:49:45 2013 -0700 + Merge pull request #877 from mateiz/docs + [Doc improvements for 0.8] + + 6edef9c Sat Aug 31 13:39:24 2013 -0700 + Merge pull request #861 from AndreSchumacher/pyspark_sampling_function + [Pyspark sampling function] + + fd89835 Sat Aug 31 13:18:12 2013 -0700 + Merge pull request #870 from JoshRosen/spark-885 + [Don't send SIGINT / ctrl-c to Py4J gateway subprocess] + + 618f0ec Fri Aug 30 18:17:13 2013 -0700 + Merge pull request #869 from AndreSchumacher/subtract + [PySpark: implementing subtractByKey(), subtract() and keyBy()] + + 94bb7fd Fri Aug 30 12:05:13 2013 -0700 + Merge pull request #876 from mbautin/master_hadoop_rdd_conf + [Make HadoopRDD's configuration accessible] + + 9e17e45 Fri Aug 30 00:22:53 2013 -0700 + Merge pull request #875 from shivaram/build-fix + [Fix broken build by removing addIntercept] + + 016787d Thu Aug 29 22:15:14 2013 -0700 + Merge pull request #863 from shivaram/etrain-ridge + [Adding linear regression and refactoring Ridge regression to use SGD] + + 852d810 Thu Aug 29 22:13:15 2013 -0700 + Merge pull request #819 from shivaram/sgd-cleanup + [Change SVM to use {0,1} labels] + + ca71620 Thu Aug 29 21:51:14 2013 -0700 + Merge pull request #857 from mateiz/assembly + [Change build and run instructions to use assemblies] + + 1528776 Thu Aug 29 21:30:47 2013 -0700 + Merge pull request #874 from jerryshao/fix-report-bug + [Fix removed block zero size log reporting] + + abdbacf Wed Aug 28 21:11:31 2013 -0700 + Merge pull request #871 from pwendell/expose-local + [Expose `isLocal` in SparkContext.] 
+ + afcade3 Wed Aug 28 20:15:40 2013 -0700 + Merge pull request #873 from pwendell/master + [Hot fix for command runner] + + baa84e7 Wed Aug 28 12:44:46 2013 -0700 + Merge pull request #865 from tgravescs/fixtmpdir + [Spark on Yarn should use yarn approved directories for spark.local.dir and tmp] + + cd043cf Tue Aug 27 19:50:32 2013 -0700 + Merge pull request #867 from tgravescs/yarnenvconfigs + [Spark on Yarn allow users to specify environment variables ] + + 898da7e Mon Aug 26 20:40:49 2013 -0700 + Merge pull request #859 from ianbuss/sbt_opts + [Pass SBT_OPTS environment through to sbt_launcher] + + 17bafea Mon Aug 26 11:59:32 2013 -0700 + Merge pull request #864 from rxin/json1 + [Revert json library change] + + f9fc5c1 Sat Aug 24 15:19:56 2013 -0700 + Merge pull request #603 from pwendell/ec2-updates + [Several Improvements to EC2 Scripts] + + d282c1e Fri Aug 23 11:20:20 2013 -0700 + Merge pull request #860 from jey/sbt-ide-fixes + [Fix IDE project generation under SBT] + + 5a6ac12 Thu Aug 22 22:08:03 2013 -0700 + Merge pull request #701 from ScrapCodes/documentation-suggestions + [Documentation suggestions for spark streaming.] + + 46ea0c1 Thu Aug 22 15:57:28 2013 -0700 + Merge pull request #814 from holdenk/master + [Create less instances of the random class during ALS initialization.] + + 9ac3d62 Thu Aug 22 15:51:10 2013 -0700 + Merge pull request #856 from jey/sbt-fix-hadoop-0.23.9 + [Re-add removed dependency to fix build under Hadoop 0.23.9] + + ae8ba83 Thu Aug 22 10:14:54 2013 -0700 + Merge pull request #855 from jey/update-build-docs + [Update build docs] + + 8a36fd0 Thu Aug 22 10:13:35 2013 -0700 + Merge pull request #854 from markhamstra/pomUpdate + [Synced sbt and maven builds to use the same dependencies, etc.] + + c2d00f1 Thu Aug 22 10:13:03 2013 -0700 + Merge pull request #832 from alig/coalesce + [Coalesced RDD with locality] + + e6d66c8 Wed Aug 21 17:44:31 2013 -0700 + Merge pull request #853 from AndreSchumacher/double_rdd + [Implementing SPARK-838: Add DoubleRDDFunctions methods to PySpark] + + 2905611 Tue Aug 20 17:36:14 2013 -0700 + Merge pull request #851 from markhamstra/MutablePairTE + [Removed meaningless types] + + d61337f Tue Aug 20 10:06:06 2013 -0700 + Merge pull request #844 from markhamstra/priorityRename + [Renamed 'priority' to 'jobId' and assorted minor changes] + + 8cae72e Mon Aug 19 23:40:04 2013 -0700 + Merge pull request #828 from mateiz/sched-improvements + [Scheduler fixes and improvements] + + efeb142 Mon Aug 19 19:23:50 2013 -0700 + Merge pull request #849 from mateiz/web-fixes + [Small fixes to web UI] + + abdc1f8 Mon Aug 19 18:30:56 2013 -0700 + Merge pull request #847 from rxin/rdd + [Allow subclasses of Product2 in all key-value related classes] + + 8fa0747 Sun Aug 18 17:02:54 2013 -0700 + Merge pull request #840 from AndreSchumacher/zipegg + [Implementing SPARK-878 for PySpark: adding zip and egg files to context ...] + + 1e137a5 Sat Aug 17 22:22:32 2013 -0700 + Merge pull request #846 from rxin/rdd + [Two minor RDD refactoring] + + e89ffc7 Fri Aug 16 14:02:34 2013 -0700 + Merge pull request #839 from jegonzal/zip_partitions + [Currying RDD.zipPartitions ] + + 1fb1b09 Thu Aug 15 22:15:05 2013 -0700 + Merge pull request #841 from rxin/json + [Use the JSON formatter from Scala library and removed dependency on lift-json.] 
+ + c69c489 Thu Aug 15 20:55:09 2013 -0700 + Merge pull request #843 from Reinvigorate/bug-879 + [fixing typo in conf/slaves] + + 230ab27 Thu Aug 15 17:45:17 2013 -0700 + Merge pull request #834 from Daemoen/master + [Updated json output to allow for display of worker state] + + 659553b Thu Aug 15 16:56:31 2013 -0700 + Merge pull request #836 from pwendell/rename + [Rename `memoryBytesToString` and `memoryMegabytesToString`] + + 28369ff Thu Aug 15 16:44:02 2013 -0700 + Merge pull request #829 from JoshRosen/pyspark-unit-tests-python-2.6 + [Fix PySpark unit tests on Python 2.6] + + 1a13460 Thu Aug 15 15:50:44 2013 -0700 + Merge pull request #833 from rxin/ui + [Various UI improvements.] + + 044a088 Wed Aug 14 20:43:49 2013 -0700 + Merge pull request #831 from rxin/scheduler + [A few small scheduler / job description changes.] + + 839f2d4 Wed Aug 14 16:17:23 2013 -0700 + Merge pull request #822 from pwendell/ui-features + [Adding GC Stats to TaskMetrics (and three small fixes)] + + 63446f9 Wed Aug 14 00:17:07 2013 -0700 + Merge pull request #826 from kayousterhout/ui_fix + [Fixed 2 bugs in executor UI (incl. SPARK-877)] + + 3f14cba Tue Aug 13 20:09:51 2013 -0700 + Merge pull request #825 from shivaram/maven-repl-fix + [Set SPARK_CLASSPATH for maven repl tests] + + 596adc6 Tue Aug 13 19:41:34 2013 -0700 + Merge pull request #824 from mateiz/mesos-0.12.1 + [Update to Mesos 0.12.1] + + d316af9 Tue Aug 13 15:31:01 2013 -0700 + Merge pull request #821 from pwendell/print-launch-command + [Print run command to stderr rather than stdout] + + 1f79d21 Tue Aug 13 15:23:54 2013 -0700 + Merge pull request #818 from kayousterhout/killed_fix + [Properly account for killed tasks.] + + 622f83c Tue Aug 13 09:58:52 2013 -0700 + Merge pull request #817 from pwendell/pr_784 + [Minor clean-up in metrics servlet code] + + a0133bf Tue Aug 13 09:28:18 2013 -0700 + Merge pull request #784 from jerryshao/dev-metrics-servlet + [Add MetricsServlet for Spark metrics system] + + e2fdac6 Mon Aug 12 21:26:59 2013 -0700 + Merge pull request #802 from stayhf/SPARK-760-Python + [Simple PageRank algorithm implementation in Python for SPARK-760] + + d3525ba Mon Aug 12 21:02:39 2013 -0700 + Merge pull request #813 from AndreSchumacher/add_files_pyspark + [Implementing SPARK-865: Add the equivalent of ADD_JARS to PySpark] + + 9e02da2 Mon Aug 12 20:22:27 2013 -0700 + Merge pull request #812 from shivaram/maven-mllib-tests + [Create SparkContext in beforeAll for MLLib tests] + + 65d0d91 Mon Aug 12 19:00:57 2013 -0700 + Merge pull request #807 from JoshRosen/guava-optional + [Change scala.Option to Guava Optional in Java APIs] + + 4346f0a Mon Aug 12 12:12:12 2013 -0700 + Merge pull request #809 from shivaram/sgd-cleanup + [Clean up scaladoc in ML Lib.] + + ea1b4ba Mon Aug 12 08:09:58 2013 -0700 + Merge pull request #806 from apivovarov/yarn-205 + [Changed yarn.version to 2.0.5 in pom.xml] + + 2a39d2c Sun Aug 11 20:35:09 2013 -0700 + Merge pull request #810 from pwendell/dead_doc_code + [Remove now dead code inside of docs] + + e5b9ed2 Sun Aug 11 17:22:47 2013 -0700 + Merge pull request #808 from pwendell/ui_compressed_bytes + [Report compressed bytes read when calculating TaskMetrics] + + 3796486 Sun Aug 11 14:51:47 2013 -0700 + Merge pull request #805 from woggle/hadoop-rdd-jobconf + [Use new Configuration() instead of slower new JobConf() in SerializableWritable] + + ff9ebfa Sun Aug 11 10:52:55 2013 -0700 + Merge pull request #762 from shivaram/sgd-cleanup + [Refactor SGD options into a new class.] 
+ + 95c62ca Sun Aug 11 10:30:52 2013 -0700 + Merge pull request #804 from apivovarov/master + [Fixed path to JavaALS.java and JavaKMeans.java, fixed hadoop2-yarn profi...] + + 06e4f2a Sat Aug 10 18:06:23 2013 -0700 + Merge pull request #789 from MLnick/master + [Adding Scala version of PageRank example] + + 71c63de Sat Aug 10 10:21:20 2013 -0700 + Merge pull request #795 from mridulm/master + [Fix bug reported in PR 791 : a race condition in ConnectionManager and Connection] + + d17eeb9 Sat Aug 10 09:02:27 2013 -0700 + Merge pull request #785 from anfeng/master + [expose HDFS file system stats via Executor metrics] + + dce5e47 Fri Aug 9 21:53:45 2013 -0700 + Merge pull request #800 from dlyubimov/HBASE_VERSION + [Pull HBASE_VERSION in the head of sbt build] + + cd247ba Fri Aug 9 20:41:13 2013 -0700 + Merge pull request #786 from shivaram/mllib-java + [Java fixes, tests and examples for ALS, KMeans] + + b09d4b7 Fri Aug 9 13:17:08 2013 -0700 + Merge pull request #799 from woggle/sync-fix + [Remove extra synchronization in ResultTask] + + 0bc63bf Fri Aug 9 13:16:25 2013 -0700 + Merge pull request #801 from pwendell/print-launch-command + [Print launch command [Branch 0.8 version]] + + cc6b92e Fri Aug 9 13:00:33 2013 -0700 + Merge pull request #775 from pwendell/print-launch-command + [Log the launch command for Spark daemons] + + f94fc75 Fri Aug 9 10:04:03 2013 -0700 + Merge pull request #788 from shane-huang/sparkjavaopts + [For standalone mode, add worker local env setting of SPARK_JAVA_OPTS as ...] + + 63b6e02 Thu Aug 8 14:02:02 2013 -0700 + Merge pull request #797 from mateiz/chill-0.3.1 + [Update to Chill 0.3.1] + + 9955e5a Thu Aug 8 11:03:38 2013 -0700 + Merge pull request #796 from pwendell/bootstrap-design + [Bootstrap re-design] + + 5133e4b Wed Aug 7 15:50:45 2013 -0700 + Merge pull request #790 from kayousterhout/fix_throughput + [Fixed issue in UI that decreased scheduler throughput by 5x or more] + + 3c8478e Tue Aug 6 23:25:03 2013 -0700 + Merge pull request #747 from mateiz/improved-lr + [Update the Python logistic regression example] + + 6b043a6 Tue Aug 6 22:31:02 2013 -0700 + Merge pull request #724 from dlyubimov/SPARK-826 + [SPARK-826: fold(), reduce(), collect() always attempt to use java serialization] + + de6c4c9 Tue Aug 6 17:09:50 2013 -0700 + Merge pull request #787 from ash211/master + [Update spark-standalone.md] + + df4d10d Tue Aug 6 15:44:05 2013 -0700 + Merge pull request #779 from adatao/adatao-global-SparkEnv + [[HOTFIX] Extend thread safety for SparkEnv.get()] + + d2b0f0c Tue Aug 6 14:49:39 2013 -0700 + Merge pull request #770 from stayhf/SPARK-760-Java + [Simple PageRank algorithm implementation in Java for SPARK-760] + + d031f73 Mon Aug 5 22:33:00 2013 -0700 + Merge pull request #782 from WANdisco/master + [SHARK-94 Log the files computed by HadoopRDD and NewHadoopRDD] + + 1b63dea Mon Aug 5 22:21:26 2013 -0700 + Merge pull request #769 from markhamstra/NegativeCores + [SPARK-847 + SPARK-845: Zombie workers and negative cores] + + 828aff7 Mon Aug 5 21:37:33 2013 -0700 + Merge pull request #776 from gingsmith/master + [adding matrix factorization data generator] + + 8b27789 Mon Aug 5 19:14:52 2013 -0700 + Merge pull request #774 from pwendell/job-description + [Show user-defined job name in UI] + + 550b0cf Mon Aug 5 12:10:32 2013 -0700 + Merge pull request #780 from cybermaster/master + [SPARK-850] + + 22abbc1 Fri Aug 2 16:37:59 2013 -0700 + Merge pull request #772 from karenfeng/ui-843 + [Show app duration] + + 9d7dfd2 Thu Aug 1 17:41:58 2013 -0700 + Merge pull 
request #743 from pwendell/app-metrics + [Add application metrics to standalone master] + + 6d7afd7 Thu Aug 1 17:13:28 2013 -0700 + Merge pull request #768 from pwendell/pr-695 + [Minor clean-up of fair scheduler UI] + + 5e7b38f Thu Aug 1 14:59:33 2013 -0700 + Merge pull request #695 from xiajunluan/pool_ui + [Enhance job ui in spark ui system with adding pool information] + + 0a96493 Thu Aug 1 11:27:17 2013 -0700 + Merge pull request #760 from karenfeng/heading-update + [Clean up web UI page headers] + + cb7dd86 Thu Aug 1 11:06:10 2013 -0700 + Merge pull request #758 from pwendell/master-json + [Add JSON path to master index page] + + 58756b7 Wed Jul 31 23:45:41 2013 -0700 + Merge pull request #761 from mateiz/kmeans-generator + [Add data generator for K-means] + + ecab635 Wed Jul 31 18:16:55 2013 -0700 + Merge pull request #763 from c0s/assembly + [SPARK-842. Maven assembly is including examples libs and dependencies] + + 39c75f3 Wed Jul 31 15:52:36 2013 -0700 + Merge pull request #757 from BlackNiuza/result_task_generation + [Bug fix: SPARK-837] + + b2b86c2 Wed Jul 31 15:51:39 2013 -0700 + Merge pull request #753 from shivaram/glm-refactor + [Build changes for ML lib] + + 14bf2fe Wed Jul 31 14:18:16 2013 -0700 + Merge pull request #749 from benh/spark-executor-uri + [Added property 'spark.executor.uri' for launching on Mesos.] + + 4ba4c3f Wed Jul 31 13:14:49 2013 -0700 + Merge pull request #759 from mateiz/split-fix + [Use the Char version of split() instead of the String one in MLUtils] + + a386ced Wed Jul 31 11:22:50 2013 -0700 + Merge pull request #754 from rxin/compression + [Compression codec change] + + 0be071a Wed Jul 31 11:11:59 2013 -0700 + Merge pull request #756 from cdshines/patch-1 + [Refactored Vector.apply(length, initializer) replacing excessive code with library method] + + d4556f4 Wed Jul 31 08:48:14 2013 -0700 + Merge pull request #751 from cdshines/master + [Cleaned Partitioner & PythonPartitioner source by taking out non-related logic to Utils] + + 29b8cd3 Tue Jul 30 21:30:33 2013 -0700 + Merge pull request #755 from jerryshao/add-apache-header + [Add Apache license header to metrics system] + + e87de03 Tue Jul 30 15:00:08 2013 -0700 + Merge pull request #744 from karenfeng/bootstrap-update + [Use Bootstrap progress bars in web UI] + + ae57020 Tue Jul 30 14:56:41 2013 -0700 + Merge pull request #752 from rxin/master + [Minor mllib cleanup] + + 8aee118 Tue Jul 30 10:27:54 2013 -0700 + Merge pull request #748 from atalwalkar/master + [made SimpleUpdater consistent with other updaters] + + 468a36c Mon Jul 29 19:44:33 2013 -0700 + Merge pull request #746 from rxin/cleanup + [Internal cleanup] + + 1e1ffb1 Mon Jul 29 19:26:19 2013 -0700 + Merge pull request #745 from shivaram/loss-update-fix + [Remove duplicate loss history in Gradient Descent] + + c99b674 Mon Jul 29 16:32:55 2013 -0700 + Merge pull request #735 from karenfeng/ui-807 + [Totals for shuffle data and CPU time] + + fe7298b Mon Jul 29 14:01:00 2013 -0700 + Merge pull request #741 from pwendell/usability + [Fix two small usability issues] + + c34c0f6 Mon Jul 29 13:18:10 2013 -0700 + Merge pull request #731 from pxinghao/master + [Adding SVM and Lasso] + + f3d72ff Fri Jul 26 17:19:27 2013 -0700 + Merge pull request #739 from markhamstra/toolsPom + [Missing tools/pom.xml scalatest dependency] + + cb36677 Fri Jul 26 16:59:30 2013 -0700 + Merge pull request #738 from harsha2010/pruning + [Fix bug in Partition Pruning.] 
+ + f3cf094 Thu Jul 25 14:53:21 2013 -0700 + Merge pull request #734 from woggle/executor-env2 + [Get more env vars from driver rather than worker] + + 51c2427 Thu Jul 25 00:03:11 2013 -0700 + Merge pull request #732 from ryanlecompte/master + [Refactor Kryo serializer support to use chill/chill-java] + + 52723b9 Wed Jul 24 14:33:02 2013 -0700 + Merge pull request #728 from jey/examples-jar-env + [Fix setting of SPARK_EXAMPLES_JAR] + + 20338c2 Wed Jul 24 14:32:24 2013 -0700 + Merge pull request #729 from karenfeng/ui-811 + [Stage Page updates] + + 5584ebc Wed Jul 24 11:46:46 2013 -0700 + Merge pull request #675 from c0s/assembly + [Building spark assembly for further consumption of the Spark project with a deployed cluster] + + a73f3ee Wed Jul 24 08:59:14 2013 -0700 + Merge pull request #671 from jerryshao/master + [Add metrics system for Spark] + + b011329 Tue Jul 23 22:50:09 2013 -0700 + Merge pull request #727 from rxin/scheduler + [Scheduler code style cleanup.] + + 876125b Tue Jul 23 22:28:21 2013 -0700 + Merge pull request #726 from rxin/spark-826 + [SPARK-829: scheduler shouldn't hang if a task contains unserializable objects in its closure] + + 2f1736c Tue Jul 23 15:53:30 2013 -0700 + Merge pull request #725 from karenfeng/task-start + [Creates task start events] + + 5364f64 Tue Jul 23 13:40:34 2013 -0700 + Merge pull request #723 from rxin/mllib + [Made RegressionModel serializable and added unit tests to make sure predict methods would work.] + + f369e0e Tue Jul 23 13:22:27 2013 -0700 + Merge pull request #720 from ooyala/2013-07/persistent-rdds-api + [Add a public method getCachedRdds to SparkContext] + + 401aac8 Mon Jul 22 16:57:16 2013 -0700 + Merge pull request #719 from karenfeng/ui-808 + [Creates Executors tab for Jobs UI] + + 8ae1436 Mon Jul 22 16:03:04 2013 -0700 + Merge pull request #722 from JoshRosen/spark-825 + [Fix bug: DoubleRDDFunctions.sampleStdev() computed non-sample stdev()] + + 15fb394 Sun Jul 21 10:33:38 2013 -0700 + Merge pull request #716 from c0s/webui-port + [Regression: default webui-port can't be set via command line "--webui-port" anymore] + + c40f0f2 Fri Jul 19 13:33:04 2013 -0700 + Merge pull request #711 from shivaram/ml-generators + [Move ML lib data generator files to util/] + + 413b841 Fri Jul 19 13:31:38 2013 -0700 + Merge pull request #717 from viirya/dev1 + [Do not copy local jars given to SparkContext in yarn mode] + + 0d0a47c Thu Jul 18 12:06:37 2013 -0700 + Merge pull request #710 from shivaram/ml-updates + [Updates to LogisticRegression] + + c6235b5 Thu Jul 18 11:43:48 2013 -0700 + Merge pull request #714 from adatao/master + [[BUGFIX] Fix for sbt/sbt script SPARK_HOME setting] + + 009c79e Thu Jul 18 11:41:52 2013 -0700 + Merge pull request #715 from viirya/dev1 + [fix a bug in build process that pulls in two versions of ASM.] + + 985a9e3 Wed Jul 17 22:27:19 2013 -0700 + Merge pull request #712 from stayhf/SPARK-817 + [Consistently invoke bash with /usr/bin/env bash in scripts to make code ...] + + cad48ed Tue Jul 16 21:41:28 2013 -0700 + Merge pull request #708 from ScrapCodes/dependencies-upgrade + [Dependency upgrade Akka 2.0.3 -> 2.0.5] + + 8a8a8f2 Mon Jul 15 23:09:21 2013 -0700 + Merge pull request #705 from rxin/errormessages + [Throw a more meaningful message when runJob is called to launch tasks on non-existent partitions.] 
+ + ed8415b Mon Jul 15 16:41:04 2013 -0700 + Merge pull request #703 from karenfeng/ui-802 + [Link to job UI from standalone deploy cluster web UI] + + e3d3e6f Mon Jul 15 14:59:44 2013 -0700 + Merge pull request #702 from karenfeng/ui-fixes + [Adds app name in HTML page titles on job web UI] + + c7877d5 Sun Jul 14 12:58:13 2013 -0700 + Merge pull request #689 from BlackNiuza/application_status + [Bug fix: SPARK-796] + + 10c0593 Sun Jul 14 11:45:18 2013 -0700 + Merge pull request #699 from pwendell/ui-env + [Add `Environment` tab to SparkUI.] + + 89e8549 Sat Jul 13 16:11:08 2013 -0700 + Merge pull request #698 from Reinvigorate/sm-deps-change + [changing com.google.code.findbugs maven coordinates] + + 77c69ae Fri Jul 12 23:05:21 2013 -0700 + Merge pull request #697 from pwendell/block-locations + [Show block locations in Web UI.] + + 5a7835c Fri Jul 12 20:28:21 2013 -0700 + Merge pull request #691 from karenfeng/logpaging + [Create log pages] + + 71ccca0 Fri Jul 12 20:25:06 2013 -0700 + Merge pull request #696 from woggle/executor-env + [Pass executor env vars (e.g. SPARK_CLASSPATH) to compute-classpath.sh] + + 90fc3f3 Fri Jul 12 20:23:36 2013 -0700 + Merge pull request #692 from Reinvigorate/takeOrdered + [adding takeOrdered() to RDD] + + 018d04c Thu Jul 11 12:48:37 2013 -0700 + Merge pull request #684 from woggle/mesos-classloader + [Explicitly set class loader for MesosSchedulerDriver callbacks.] + + bc19477 Wed Jul 10 22:29:41 2013 -0700 + Merge pull request #693 from c0s/readme + [Updating README to reflect Scala 2.9.3 requirements] + + 7dcda9a Mon Jul 8 23:24:23 2013 -0700 + Merge pull request #688 from markhamstra/scalaDependencies + [Fixed SPARK-795 with explicit dependencies] + + 638927b Mon Jul 8 22:58:50 2013 -0700 + Merge pull request #683 from shivaram/sbt-test-fix + [Remove some stack traces from sbt test output] + + 3c13178 Mon Jul 8 14:50:34 2013 -0700 + Merge pull request #687 from atalwalkar/master + [Added "Labeled" to util functions for labeled data] + + 744da8e Sun Jul 7 17:42:25 2013 -0700 + Merge pull request #679 from ryanlecompte/master + [Make binSearch method tail-recursive for RidgeRegression] + + 3cc6818 Sat Jul 6 19:51:20 2013 -0700 + Merge pull request #668 from shimingfei/guava-14.0.1 + [update guava version from 11.0.1 to 14.0.1] + + 2216188 Sat Jul 6 16:18:15 2013 -0700 + Merge pull request #676 from c0s/asf-avro + [Use standard ASF published avro module instead of a proprietory built one] + + 94871e4 Sat Jul 6 15:26:19 2013 -0700 + Merge pull request #655 from tgravescs/master + [Add support for running Spark on Yarn on a secure Hadoop Cluster] + + 3f918b3 Sat Jul 6 12:45:18 2013 -0700 + Merge pull request #672 from holdenk/master + [s/ActorSystemImpl/ExtendedActorSystem/ as ActorSystemImpl results in a warning] + + 2a36e54 Sat Jul 6 12:43:21 2013 -0700 + Merge pull request #673 from xiajunluan/master + [Add config template file for fair scheduler feature] + + 7ba7fa1 Sat Jul 6 11:45:08 2013 -0700 + Merge pull request #674 from liancheng/master + [Bug fix: SPARK-789] + + f4416a1 Sat Jul 6 11:41:58 2013 -0700 + Merge pull request #681 from BlackNiuza/memory_leak + [Remove active job from idToActiveJob when job finished or aborted] + + e063e29 Fri Jul 5 21:54:52 2013 -0700 + Merge pull request #680 from tdas/master + [Fixed major performance bug in Network Receiver] + + bf1311e Fri Jul 5 17:32:44 2013 -0700 + Merge pull request #678 from mateiz/ml-examples + [Start of ML package] + + 6ad85d0 Thu Jul 4 21:32:29 2013 -0700 + Merge pull request #677 from 
jerryshao/fix_stage_clean + [Clean StageToInfos periodically when spark.cleaner.ttl is enabled] + + 2e32fc8 Thu Jul 4 12:18:20 2013 -0700 + Merge pull request #666 from c0s/master + [hbase dependency is missed in hadoop2-yarn profile of examples module +] + + 6d60fe5 Mon Jul 1 18:24:03 2013 -0700 + Merge pull request #666 from c0s/master + [hbase dependency is missed in hadoop2-yarn profile of examples module] + + ccfe953 Sat Jun 29 17:57:53 2013 -0700 + Merge pull request #577 from skumargithub/master + [Example of cumulative counting using updateStateByKey] + + 50ca176 Thu Jun 27 22:24:52 2013 -0700 + Merge pull request #664 from pwendell/test-fix + [Removing incorrect test statement] + + e49bc8c Wed Jun 26 11:13:33 2013 -0700 + Merge pull request #663 from stephenh/option_and_getenv + [Be cute with Option and getenv.] + + f5e32ed Tue Jun 25 09:16:57 2013 -0700 + Merge pull request #661 from mesos/streaming + [Kafka fixes and DStream.count fix for master] + + 1249e91 Mon Jun 24 21:46:33 2013 -0700 + Merge pull request #572 from Reinvigorate/sm-block-interval + [Adding spark.streaming.blockInterval property] + + cfcda95 Mon Jun 24 21:44:50 2013 -0700 + Merge pull request #571 from Reinvigorate/sm-kafka-serializers + [Surfacing decoders on KafkaInputDStream] + + 575aff6 Mon Jun 24 21:35:50 2013 -0700 + Merge pull request #567 from Reinvigorate/sm-count-fix + [Fixing count() in Spark Streaming] + + 3e61bef Sat Jun 22 16:22:47 2013 -0700 + Merge pull request #648 from shivaram/netty-dbg + [Shuffle fixes and cleanup] + + 1ef5d0d Sat Jun 22 09:35:57 2013 -0700 + Merge pull request #644 from shimingfei/joblogger + [add Joblogger to Spark (on new Spark code)] + + 7e4b266 Sat Jun 22 07:53:18 2013 -0700 + Merge pull request #563 from jey/python-optimization + [Optimize PySpark worker invocation] + + 71030ba Wed Jun 19 15:21:03 2013 -0700 + Merge pull request #654 from lyogavin/enhance_pipe + [fix typo and coding style in #638] + + 73f4c7d Tue Jun 18 04:21:17 2013 -0700 + Merge pull request #605 from esjewett/SPARK-699 + [Add hBase example (retry of pull request #596)] + + 9933836 Tue Jun 18 02:41:10 2013 -0700 + Merge pull request #647 from jerryshao/master + [Reduce ZippedPartitionsRDD's getPreferredLocations complexity from O(2^2n) to O(2^n)] + + db42451 Mon Jun 17 15:26:36 2013 -0700 + Merge pull request #643 from adatao/master + [Bug fix: Zero-length partitions result in NaN for overall mean & variance] + + e82a2ff Mon Jun 17 15:13:15 2013 -0700 + Merge pull request #653 from rxin/logging + [SPARK-781: Log the temp directory path when Spark says "Failed to create temp directory."] + + e6d1277 Mon Jun 17 12:56:25 2013 -0700 + Merge pull request #638 from lyogavin/enhance_pipe + [Enhance pipe to support more features we can do in hadoop streaming] + + f961aac Sat Jun 15 00:53:41 2013 -0700 + Merge pull request #649 from ryanlecompte/master + [Add top K method to RDD using a bounded priority queue] + + 6602d94 Fri Jun 14 10:41:31 2013 -0700 + Merge pull request #651 from rxin/groupbykey + [SPARK-772 / SPARK-774: groupByKey and cogroup should disable map side combine] + + d93851a Thu Jun 13 13:38:45 2013 -0700 + Merge pull request #645 from pwendell/compression + [Adding compression to Hadoop save functions] + + f1da591 Wed Jun 12 17:55:08 2013 -0700 + Merge pull request #646 from markhamstra/jvmArgs + [Fixed jvmArgs in maven build.] + + 0e94b73 Mon Jun 10 13:00:31 2013 -0700 + Merge pull request #625 from stephenh/fix-start-slave + [Fix start-slave not passing instance number to spark-daemon.] 
+ + 74b91d5 Sat Jun 8 01:19:40 2013 -0700 + Merge pull request #629 from c0s/master + [Sometime Maven build runs out of PermGen space.] + + c8fc423 Fri Jun 7 22:43:18 2013 -0700 + Merge pull request #631 from jerryshao/master + [Fix block manager UI display issue when enable spark.cleaner.ttl] + + 1ae60bc Fri Jun 7 22:39:06 2013 -0700 + Merge pull request #634 from xiajunluan/master + [[Spark-753] Fix ClusterSchedulSuite unit test failed ] + + fff3728 Tue Jun 4 16:09:50 2013 -0700 + Merge pull request #640 from pwendell/timeout-update + [Fixing bug in BlockManager timeout] + + f420d4f Tue Jun 4 15:25:58 2013 -0700 + Merge pull request #639 from pwendell/timeout-update + [Bump akka and blockmanager timeouts to 60 seconds] + + 84530ba Fri May 31 17:06:13 2013 -0700 + Merge pull request #636 from rxin/unpersist + [Unpersist More block manager cleanup.] + + ef77bb7 Thu May 30 14:50:06 2013 -0700 + Merge pull request #627 from shivaram/master + [Netty and shuffle bug fixes] + + 8cb8178 Thu May 30 14:17:44 2013 -0700 + Merge pull request #628 from shivaram/zero-block-size + [Skip fetching zero-sized blocks in NIO.] + + 6ed7139 Wed May 29 10:14:22 2013 -0700 + Merge pull request #626 from stephenh/remove-add-if-no-port + [Remove unused addIfNoPort.] + + 41d230c Tue May 28 23:35:24 2013 -0700 + Merge pull request #611 from squito/classloader + [Use default classloaders for akka & deserializing task results] + + 3db1e17 Mon May 27 21:31:43 2013 -0700 + Merge pull request #620 from jerryshao/master + [Fix CheckpointRDD java.io.FileNotFoundException when calling getPreferredLocations] + + 3d4891d Sat May 25 23:38:05 2013 -0700 + Merge pull request #621 from JoshRosen/spark-613 + [Use ec2-metadata in start-slave.sh to detect if running on EC2] + + e8d4b6c Sat May 25 21:09:03 2013 -0700 + Merge pull request #529 from xiajunluan/master + [[SPARK-663]Implement Fair Scheduler in Spark Cluster Scheduler ] + + 9a3c344 Sat May 25 17:53:43 2013 -0700 + Merge pull request #624 from rxin/master + [NonJavaSerializableClass should not be Java serializable...] + + 24e41aa Fri May 24 16:48:52 2013 -0700 + Merge pull request #623 from rxin/master + [Automatically configure Netty port.] + + 69161f9 Fri May 24 14:42:13 2013 -0700 + Merge pull request #622 from rxin/master + [bug fix: Shuffle block iterator is ignoring the shuffle serializer setting.] 
+ + dbbedfc Thu May 23 23:11:06 2013 -0700 + Merge pull request #616 from jey/maven-netty-exclusion + [Exclude old versions of Netty from Maven-based build] + + a2b0a79 Tue May 21 18:16:20 2013 -0700 + Merge pull request #619 from woggling/adjust-sampling + [Use ARRAY_SAMPLE_SIZE constant instead of hard-coded 100.0 in SizeEstimator] + + 66dac44 Tue May 21 11:41:42 2013 -0700 + Merge pull request #618 from woggling/dead-code-disttest + [DistributedSuite: remove dead code] + + 5912cc4 Fri May 17 19:58:40 2013 -0700 + Merge pull request #610 from JoshRosen/spark-747 + [Throw exception if TaskResult exceeds Akka frame size] + + 6c27c38 Thu May 16 17:33:56 2013 -0700 + Merge pull request #615 from rxin/build-fix + [Maven build fix & two other small changes] + + 2f576ab Wed May 15 18:06:24 2013 -0700 + Merge pull request #602 from rxin/shufflemerge + [Manual merge & cleanup of Shane's Shuffle Performance Optimization] + + 48c6f46 Wed May 15 10:47:19 2013 -0700 + Merge pull request #612 from ash211/patch-4 + [Docs: Mention spark shell's default for MASTER] + + 203d7b7 Wed May 15 00:47:20 2013 -0700 + Merge pull request #593 from squito/driver_ui_link + [Master UI has link to Application UI] + + 016ac86 Mon May 13 21:45:36 2013 -0700 + Merge pull request #601 from rxin/emptyrdd-master + [EmptyRDD (master branch 0.8)] + + 4b354e0 Mon May 13 17:39:19 2013 -0700 + Merge pull request #589 from mridulm/master + [Add support for instance local scheduling] + + 5dbc9b2 Sun May 12 11:03:10 2013 -0700 + Merge pull request #608 from pwendell/SPARK-738 + [SPARK-738: Spark should detect and wrap nonserializable exceptions] + + 63e1999 Fri May 10 13:54:03 2013 -0700 + Merge pull request #606 from markhamstra/foreachPartition_fix + [Actually use the cleaned closure in foreachPartition] + + 42bbe89 Wed May 8 22:30:31 2013 -0700 + Merge pull request #599 from JoshRosen/spark-670 + [Fix SPARK-670: EC2 'start' command should require -i option.] + + 0f1b7a0 Wed May 8 13:38:50 2013 -0700 + Merge pull request #596 from esjewett/master + [hBase example] + + 7af92f2 Sat May 4 22:29:17 2013 -0700 + Merge pull request #597 from JoshRosen/webui-fixes + [Two minor bug fixes for Spark Web UI] + + c74ce60 Sat May 4 22:26:35 2013 -0700 + Merge pull request #598 from rxin/blockmanager + [Fixed flaky unpersist test in DistributedSuite.] 
+ + 3bf2c86 Fri May 3 18:27:30 2013 -0700 + Merge pull request #594 from shivaram/master + [Add zip partitions to Java API] + + 2484ad7 Fri May 3 17:08:55 2013 -0700 + Merge pull request #587 from rxin/blockmanager + [A set of shuffle map output related changes] + + 6fe9d4e Thu May 2 21:33:56 2013 -0700 + Merge pull request #592 from woggling/localdir-fix + [Don't accept generated local directory names that can't be created] + + 538ee75 Thu May 2 09:01:42 2013 -0700 + Merge pull request #581 from jerryshao/master + [fix [SPARK-740] block manage UI throws exception when enabling Spark Streaming] + + 9abcbcc Wed May 1 22:45:10 2013 -0700 + Merge pull request #591 from rxin/removerdd + [RDD.unpersist: probably the most desired feature of Spark] + + aa8fe1a Tue Apr 30 22:30:18 2013 -0700 + Merge pull request #586 from mridulm/master + [Pull request to address issues Reynold Xin reported] + + f708dda Tue Apr 30 07:51:40 2013 -0700 + Merge pull request #585 from pwendell/listener-perf + [[Fix SPARK-742] Task Metrics should not employ per-record timing by default] + + 68c07ea Sun Apr 28 20:19:33 2013 -0700 + Merge pull request #582 from shivaram/master + [Add zip partitions interface] + + f6ee9a8 Sun Apr 28 15:36:04 2013 -0700 + Merge pull request #583 from mridulm/master + [Fix issues with streaming test cases after yarn branch merge] + + cf54b82 Thu Apr 25 11:45:58 2013 -0700 + Merge pull request #580 from pwendell/quickstart + [SPARK-739 Have quickstart standlone job use README] + + 118a6c7 Wed Apr 24 08:42:30 2013 -0700 + Merge pull request #575 from mridulm/master + [Manual merge of yarn branch to trunk] + + 5d8a71c Tue Apr 16 19:48:02 2013 -0700 + Merge pull request #570 from jey/increase-codecache-size + [Increase ReservedCodeCacheSize for sbt] + + ec5e553 Sun Apr 14 08:20:13 2013 -0700 + Merge pull request #558 from ash211/patch-jackson-conflict + [Don't pull in old versions of Jackson via hadoop-core] + + c1c219e Sun Apr 14 08:11:23 2013 -0700 + Merge pull request #564 from maspotts/master + [Allow latest scala in PATH, with SCALA_HOME as override (instead of vice-versa)] + + 7c10b3e Fri Apr 12 20:55:22 2013 -0700 + Merge pull request #565 from andyk/master + [Update wording of section on RDD operations in quick start guide in docs] + + 077ae0a Thu Apr 11 19:34:14 2013 -0700 + Merge pull request #561 from ash211/patch-4 + [Add details when BlockManager heartbeats time out] + + c91ff8d Wed Apr 10 15:08:23 2013 -0700 + Merge pull request #560 from ash211/patch-3 + [Typos: cluser -> cluster] + + 7cd83bf Tue Apr 9 22:07:35 2013 -0700 + Merge pull request #559 from ash211/patch-example-whitespace + [Uniform whitespace across scala examples] + + 271a4f3 Tue Apr 9 22:04:52 2013 -0700 + Merge pull request #555 from holdenk/master + [Retry failed ssh commands in the ec2 python script.] 
+ + 8ac9efb Tue Apr 9 13:50:50 2013 -0700 + Merge pull request #527 from Reinvigorate/sm-kafka-cleanup + [KafkaInputDStream fixes and improvements] + + eed54a2 Mon Apr 8 09:44:30 2013 -0700 + Merge pull request #553 from pwendell/akka-standalone + [SPARK-724 - Have Akka logging enabled by default for standalone daemons] + + b362df3 Sun Apr 7 17:17:52 2013 -0700 + Merge pull request #552 from MLnick/master + [Bumping version for Twitter Algebird to latest] + + 4b30190 Sun Apr 7 17:15:10 2013 -0700 + Merge pull request #554 from andyk/scala2.9.3 + [Fixes SPARK-723 - Update build to Scala 2.9.3] + + dfe98ca Tue Apr 2 19:24:12 2013 -0700 + Merge pull request #550 from erikvanoosten/master + [corrected Algebird example] + + b5d7830 Tue Apr 2 19:23:45 2013 -0700 + Merge pull request #551 from jey/python-bugfixes + [Python bugfixes] + + 2be2295 Sun Mar 31 18:09:14 2013 -0700 + Merge pull request #548 from markhamstra/getWritableClass_filter + [Fixed broken filter in getWritableClass[T]] + + 9831bc1 Fri Mar 29 22:16:22 2013 -0700 + Merge pull request #539 from cgrothaus/fix-webui-workdirpath + [Bugfix: WorkerWebUI must respect workDirPath from Worker] + + 3cc8ab6 Fri Mar 29 22:14:07 2013 -0700 + Merge pull request #541 from stephenh/shufflecoalesce + [Add a shuffle parameter to coalesce.] + + cad507a Fri Mar 29 22:13:12 2013 -0700 + Merge pull request #547 from jey/maven-streaming-tests-initialization-fix + [Move streaming test initialization into 'before' blocks] + + a98996d Fri Mar 29 22:12:15 2013 -0700 + Merge pull request #545 from ash211/patch-1 + [Don't use deprecated Application in example] + + 104c694 Fri Mar 29 22:11:50 2013 -0700 + Merge pull request #546 from ash211/patch-2 + [Update tuning.md] + + bc36ee4 Tue Mar 26 15:05:13 2013 -0700 + Merge pull request #543 from holdenk/master + [Re-enable deprecation warnings and fix deprecated warning.] + + b8949ca Sat Mar 23 07:19:34 2013 -0700 + Merge pull request #505 from stephenh/volatile + [Make Executor fields volatile since they're read from the thread pool.] + + fd53f2f Sat Mar 23 07:13:21 2013 -0700 + Merge pull request #510 from markhamstra/WithThing + [mapWith, flatMapWith and filterWith] + + 4c5efcf Wed Mar 20 19:29:23 2013 -0700 + Merge pull request #532 from andyk/master + [SPARK-715: Adds instructions for building with Maven to documentation] + + 3558849 Wed Mar 20 19:27:47 2013 -0700 + Merge pull request #538 from rxin/cogroup + [Added mapSideCombine flag to CoGroupedRDD. Added unit test for CoGroupedRDD.] + + ca4d083 Wed Mar 20 11:22:36 2013 -0700 + Merge pull request #528 from MLnick/java-examples + [[SPARK-707] Adding Java versions of Pi, LogQuery and K-Means examples] + + b812e6b Wed Mar 20 11:21:02 2013 -0700 + Merge pull request #526 from markhamstra/foldByKey + [Add foldByKey] + + 945d1e7 Tue Mar 19 21:59:06 2013 -0700 + Merge pull request #536 from sasurfer/master + [CoalescedRDD for many partitions] + + 1cbbe94 Tue Mar 19 21:34:34 2013 -0700 + Merge pull request #534 from stephenh/removetrycatch + [Remove try/catch block that can't be hit.] + + 71e53f8 Tue Mar 19 21:31:41 2013 -0700 + Merge pull request #537 from wishbear/configurableInputFormat + [call setConf from input format if it is Configurable] + + c1e9cdc Sat Mar 16 11:47:45 2013 -0700 + Merge pull request #525 from stephenh/subtractByKey + [Add PairRDDFunctions.subtractByKey.] 
+ + cdbfd1e Fri Mar 15 15:13:28 2013 -0700 + Merge pull request #516 from squito/fix_local_metrics + [Fix local metrics] + + f9fa2ad Fri Mar 15 15:12:43 2013 -0700 + Merge pull request #530 from mbautin/master-update-log4j-and-make-compile-in-IntelliJ + [Add a log4j compile dependency to fix build in IntelliJ] + + 4032beb Wed Mar 13 19:29:46 2013 -0700 + Merge pull request #521 from stephenh/earlyclose + [Close the reader in HadoopRDD as soon as iteration end.] + + 3c97276 Wed Mar 13 19:25:08 2013 -0700 + Merge pull request #524 from andyk/master + [Fix broken link to YARN documentation] + + 1c3d981 Wed Mar 13 19:23:48 2013 -0700 + Merge pull request #517 from Reinvigorate/sm-build-fixes + [Build fixes for streaming /w SBT] + + 2d477fd Wed Mar 13 06:49:16 2013 -0700 + Merge pull request #523 from andyk/master + [Fix broken link in Quick Start] + + 00c4d23 Tue Mar 12 22:19:00 2013 -0700 + Merge pull request #518 from woggling/long-bm-sizes + [Send block sizes as longs in BlockManager updates] + + cbf8f0d Mon Mar 11 00:23:57 2013 -0700 + Merge pull request #513 from MLnick/bagel-caching + [Adds choice of persistence level to Bagel.] + + 91a9d09 Sun Mar 10 15:48:23 2013 -0700 + Merge pull request #512 from patelh/fix-kryo-serializer + [Fix reference bug in Kryo serializer, add test, update version] + + 557cfd0 Sun Mar 10 15:44:57 2013 -0700 + Merge pull request #515 from woggling/deploy-app-death + [Notify standalone deploy client of application death.] + + 04fb81f Sun Mar 3 17:20:07 2013 -0800 + Merge pull request #506 from rxin/spark-706 + [Fixed SPARK-706: Failures in block manager put leads to read task hanging.] + + 6cf4be4 Sun Mar 3 17:16:22 2013 -0800 + Merge pull request #462 from squito/stageInfo + [Track assorted metrics for each task, report summaries to user at stage completion] + + 6bfc7ca Sat Mar 2 22:14:49 2013 -0800 + Merge pull request #504 from mosharaf/master + [Worker address was getting removed when removing an app.] + + 94b3db1 Sat Mar 2 22:13:52 2013 -0800 + Merge pull request #508 from markhamstra/TestServerInUse + [Avoid bind failure in InputStreamsSuite] + + 25c71d3 Fri Mar 1 08:00:18 2013 -0800 + Merge pull request #507 from markhamstra/poms271 + [bump version to 0.7.1-SNAPSHOT in the subproject poms] + diff --git a/README.md b/README.md index c840a68f76b17..dc8135b9b8b51 100644 --- a/README.md +++ b/README.md @@ -1,12 +1,12 @@ # Apache Spark -Lightning-Fast Cluster Computing - +Lightning-Fast Cluster Computing - ## Online Documentation You can find the latest Spark documentation, including a programming -guide, on the project webpage at . +guide, on the project webpage at . This README file only contains basic setup instructions. @@ -92,21 +92,10 @@ If your project is built with Maven, add this to your POM file's ` ## Configuration -Please refer to the [Configuration guide](http://spark.incubator.apache.org/docs/latest/configuration.html) +Please refer to the [Configuration guide](http://spark.apache.org/docs/latest/configuration.html) in the online documentation for an overview on how to configure Spark. -## Apache Incubator Notice - -Apache Spark is an effort undergoing incubation at The Apache Software -Foundation (ASF), sponsored by the Apache Incubator. Incubation is required of -all newly accepted projects until a further review indicates that the -infrastructure, communications, and decision making process have stabilized in -a manner consistent with other successful ASF projects. 
While incubation status -is not necessarily a reflection of the completeness or stability of the code, -it does indicate that the project has yet to be fully endorsed by the ASF. - - ## Contributing to Spark Contributions via GitHub pull requests are gladly accepted from their original diff --git a/assembly/pom.xml b/assembly/pom.xml index 54a25910ced7d..cdae0f9f5e820 100644 --- a/assembly/pom.xml +++ b/assembly/pom.xml @@ -21,14 +21,24 @@ org.apache.spark spark-parent - 0.9.0-incubating-SNAPSHOT + 0.9.1-incubating-SNAPSHOT ../pom.xml org.apache.spark spark-assembly_2.10 Spark Project Assembly - http://spark.incubator.apache.org/ + http://spark.apache.org/ + pom + + + scala-${scala.binary.version} + ${project.artifactId}-${project.version}-hadoop${hadoop.version}.jar + ${project.build.directory}/${spark.jar.dir}/${spark.jar.basename} + spark + /usr/share/spark + root + @@ -64,6 +74,11 @@ spark-streaming_${scala.binary.version} ${project.version} + + org.apache.spark + spark-graphx_${scala.binary.version} + ${project.version} + net.sf.py4j py4j @@ -79,7 +94,7 @@ maven-shade-plugin false - ${project.build.directory}/scala-${scala.binary.version}/${project.artifactId}-${project.version}-hadoop${hadoop.version}.jar + ${spark.jar} *:* @@ -143,6 +158,16 @@ + + spark-ganglia-lgpl + + + org.apache.spark + spark-ganglia-lgpl_${scala.binary.version} + ${project.version} + + + bigtop-dist + {% endproduction %} @@ -82,6 +82,17 @@
  • MLlib (Machine Learning)
  • Bagel (Pregel on Spark)
  • GraphX (Graph Processing)
  • [added navigation entry; HTML markup lost in extraction]
  • [added navigation entry; HTML markup lost in extraction] @@ -148,16 +159,6 @@
    Heading
    -->
  - Apache Spark is an effort undergoing incubation at the Apache Software Foundation.
    -
    - diff --git a/docs/_plugins/copy_api_dirs.rb b/docs/_plugins/copy_api_dirs.rb index acc6bf08160eb..44d64057f4fb3 100644 --- a/docs/_plugins/copy_api_dirs.rb +++ b/docs/_plugins/copy_api_dirs.rb @@ -20,7 +20,10 @@ if not (ENV['SKIP_API'] == '1' or ENV['SKIP_SCALADOC'] == '1') # Build Scaladoc for Java/Scala - projects = ["core", "examples", "repl", "bagel", "graphx", "streaming", "mllib"] + core_projects = ["core", "examples", "repl", "bagel", "graphx", "streaming", "mllib"] + external_projects = ["flume", "kafka", "mqtt", "twitter", "zeromq"] + + projects = core_projects + external_projects.map { |project_name| "external/" + project_name } puts "Moving to project root and building scaladoc." curr_dir = pwd diff --git a/docs/_plugins/production_tag.rb b/docs/_plugins/production_tag.rb new file mode 100644 index 0000000000000..9f870cf2137af --- /dev/null +++ b/docs/_plugins/production_tag.rb @@ -0,0 +1,14 @@ +module Jekyll + class ProductionTag < Liquid::Block + + def initialize(tag_name, markup, tokens) + super + end + + def render(context) + if ENV['PRODUCTION'] then super else "" end + end + end +end + +Liquid::Template.register_tag('production', Jekyll::ProductionTag) diff --git a/docs/bagel-programming-guide.md b/docs/bagel-programming-guide.md index cffa55ee952b0..da6d0c9dcd97b 100644 --- a/docs/bagel-programming-guide.md +++ b/docs/bagel-programming-guide.md @@ -16,7 +16,7 @@ This guide shows the programming model and features of Bagel by walking through To use Bagel in your program, add the following SBT or Maven dependency: groupId = org.apache.spark - artifactId = spark-bagel_{{site.SCALA_VERSION}} + artifactId = spark-bagel_{{site.SCALA_BINARY_VERSION}} version = {{site.SPARK_VERSION}} # Programming Model @@ -108,7 +108,7 @@ _Example_ ## Operations -Here are the actions and types in the Bagel API. See [Bagel.scala](https://github.com/apache/incubator-spark/blob/master/bagel/src/main/scala/org/apache/spark/bagel/Bagel.scala) for details. +Here are the actions and types in the Bagel API. See [Bagel.scala](https://github.com/apache/spark/blob/master/bagel/src/main/scala/org/apache/spark/bagel/Bagel.scala) for details. ### Actions diff --git a/docs/building-with-maven.md b/docs/building-with-maven.md index b9ff0af76f647..ded12926885b9 100644 --- a/docs/building-with-maven.md +++ b/docs/building-with-maven.md @@ -17,10 +17,10 @@ You'll need to configure Maven to use more memory than usual by setting `MAVEN_O If you don't run this, you may see errors like the following: - [INFO] Compiling 203 Scala sources and 9 Java sources to /Users/me/Development/spark/core/target/scala-{{site.SCALA_VERSION}}/classes... + [INFO] Compiling 203 Scala sources and 9 Java sources to /Users/me/Development/spark/core/target/scala-{{site.SCALA_BINARY_VERSION}}/classes... [ERROR] PermGen space -> [Help 1] - [INFO] Compiling 203 Scala sources and 9 Java sources to /Users/me/Development/spark/core/target/scala-{{site.SCALA_VERSION}}/classes... + [INFO] Compiling 203 Scala sources and 9 Java sources to /Users/me/Development/spark/core/target/scala-{{site.SCALA_BINARY_VERSION}}/classes... [ERROR] Java heap space -> [Help 1] You can fix this by setting the `MAVEN_OPTS` variable as discussed before. @@ -71,8 +71,8 @@ This setup works fine in IntelliJ IDEA 11.1.4. After opening the project via the ## Building Spark Debian Packages ## -It includes support for building a Debian package containing a 'fat-jar' which includes the repl, the examples and bagel. 
This can be created by specifying the following profiles: +The maven build includes support for building a Debian package containing the assembly 'fat-jar', PySpark, and the necessary scripts and configuration files. This can be created by specifying the following: - $ mvn -Prepl-bin -Pdeb clean package + $ mvn -Pdeb -DskipTests clean package -The debian package can then be found under repl/target. We added the short commit hash to the file name so that we can distinguish individual packages build for SNAPSHOT versions. +The debian package can then be found under assembly/target. We added the short commit hash to the file name so that we can distinguish individual packages built for SNAPSHOT versions. diff --git a/docs/configuration.md b/docs/configuration.md index da70cabba2d9b..3f03d97e8054c 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -18,8 +18,8 @@ Spark provides three locations to configure the system: Spark properties control most application settings and are configured separately for each application. The preferred way to set them is by passing a [SparkConf](api/core/index.html#org.apache.spark.SparkConf) class to your SparkContext constructor. -Alternatively, Spark will also load them from Java system properties (for compatibility with old versions -of Spark) and from a [`spark.conf` file](#configuration-files) on your classpath. +Alternatively, Spark will also load them from Java system properties, for compatibility with old versions +of Spark. SparkConf lets you configure most of the common properties to initialize a cluster (e.g., master URL and application name), as well as arbitrary key-value pairs through the `set()` method. For example, we could @@ -98,7 +98,7 @@ Apart from these, the following properties are also available, and may be useful spark.default.parallelism 8 - Default number of tasks to use for distributed shuffle operations (groupByKey, + Default number of tasks to use across the cluster for distributed shuffle operations (groupByKey, reduceByKey, etc) when not set by user. @@ -360,7 +360,16 @@ Apart from these, the following properties are also available, and may be useful spark.streaming.blockInterval 200 - Duration (milliseconds) of how long to batch new objects coming from network receivers. + Duration (milliseconds) of how long to batch new objects coming from network receivers used + in Spark Streaming. + + + + spark.streaming.unpersist + false + + Force RDDs generated and persisted by Spark Streaming to be automatically unpersisted from + Spark's memory. Setting this to true is likely to reduce Spark's RDD memory usage. @@ -379,13 +388,6 @@ Apart from these, the following properties are also available, and may be useful Too large a value decreases parallelism during broadcast (makes it slower); however, if it is too small, BlockManager might take a performance hit. - - akka.x.y.... - value - - An arbitrary akka configuration can be set directly on spark conf and it is applied for all the ActorSystems created spark wide for that SparkContext and its assigned executors as well. - - spark.shuffle.consolidateFiles @@ -394,6 +396,14 @@ Apart from these, the following properties are also available, and may be useful If set to "true", consolidates intermediate files created during a shuffle. Creating fewer files can improve filesystem performance for shuffles with large numbers of reduce tasks. It is recommended to set this to "true" when using ext4 or xfs filesystems. 
On ext3, this option might degrade performance on machines with many (>8) cores due to filesystem limitations. + + spark.shuffle.file.buffer.kb + 100 + + Size of the in-memory buffer for each shuffle file output stream, in kilobytes. These buffers + reduce the number of disk seeks and system calls made in creating intermediate shuffle files. + + spark.shuffle.spill true @@ -468,30 +478,6 @@ Apart from these, the following properties are also available, and may be useful The application web UI at `http://:4040` lists Spark properties in the "Environment" tab. This is a useful place to check to make sure that your properties have been set correctly. -## Configuration Files - -You can also configure Spark properties through a `spark.conf` file on your Java classpath. -Because these properties are usually application-specific, we recommend putting this fine *only* on your -application's classpath, and not in a global Spark classpath. - -The `spark.conf` file uses Typesafe Config's [HOCON format](https://github.com/typesafehub/config#json-superset), -which is a superset of Java properties files and JSON. For example, the following is a simple config file: - -{% highlight awk %} -# Comments are allowed -spark.executor.memory = 512m -spark.serializer = org.apache.spark.serializer.KryoSerializer -{% endhighlight %} - -The format also allows hierarchical nesting, as follows: - -{% highlight awk %} -spark.akka { - threads = 8 - timeout = 200 -} -{% endhighlight %} - # Environment Variables Certain Spark settings can be configured through environment variables, which are read from the `conf/spark-env.sh` diff --git a/docs/css/main.css b/docs/css/main.css index 31122d5633801..8566400f071c9 100755 --- a/docs/css/main.css +++ b/docs/css/main.css @@ -87,20 +87,54 @@ a:hover code { max-width: 914px; } -/** - * Make dropdown menus in nav bars show on hover instead of click - * using solution at http://stackoverflow.com/questions/8878033/how- - * to-make-twitter-bootstrap-menu-dropdown-on-hover-rather-than-click - **/ .dropdown-menu { /* Remove the default 2px top margin which causes a small gap between the hover trigger area and the popup menu */ margin-top: 0; + /* Avoid too much whitespace at the right for shorter menu items */ + min-width: 50px; } + +/** + * Make dropdown menus in nav bars show on hover instead of click + * using solution at http://stackoverflow.com/questions/8878033/how- + * to-make-twitter-bootstrap-menu-dropdown-on-hover-rather-than-click + **/ ul.nav li.dropdown:hover ul.dropdown-menu{ display: block; } + a.menu:after, .dropdown-toggle:after { content: none; } +/** Make the submenus open on hover on the parent menu item */ +ul.nav li.dropdown ul.dropdown-menu li.dropdown-submenu:hover ul.dropdown-menu { + display: block; +} + +/** Make the submenus be invisible until the parent menu item is hovered upon */ +ul.nav li.dropdown ul.dropdown-menu li.dropdown-submenu ul.dropdown-menu { + display: none; +} + +/** + * Made the navigation bar buttons not grey out when clicked. + * Essentially making nav bar buttons not react to clicks, only hover events. + */ +.navbar .nav li.dropdown.open > .dropdown-toggle { + background-color: transparent; +} + +/** + * Made the active tab caption blue. Otherwise the active tab is black, and inactive tab is blue. + * That looks weird. Changed the colors to active - blue, inactive - black, and + * no color change on hover. 
+ */ +.nav-tabs > .active > a, .nav-tabs > .active > a:hover { + color: #08c; +} + +.nav-tabs > li > a, .nav-tabs > li > a:hover { + color: #333; +} diff --git a/docs/graphx-programming-guide.md b/docs/graphx-programming-guide.md index 9fbde4eb09575..3dfed7bea9ea8 100644 --- a/docs/graphx-programming-guide.md +++ b/docs/graphx-programming-guide.md @@ -18,7 +18,7 @@ title: GraphX Programming Guide GraphX is the new (alpha) Spark API for graphs and graph-parallel computation. At a high-level, GraphX extends the Spark [RDD](api/core/index.html#org.apache.spark.rdd.RDD) by introducing the -[Resilient Distributed property Graph (RDG)](#property_graph): a directed multigraph with properties +[Resilient Distributed Property Graph](#property_graph): a directed multigraph with properties attached to each vertex and edge. To support graph computation, GraphX exposes a set of fundamental operators (e.g., [subgraph](#structural_operators), [joinVertices](#join_operators), and [mapReduceTriplets](#mrTriplets)) as well as an optimized variant of the [Pregel](#pregel) API. In @@ -29,7 +29,7 @@ addition, GraphX includes a growing collection of graph [algorithms](#graph_algo From social networks to language modeling, the growing scale and importance of graph data has driven the development of numerous new *graph-parallel* systems -(e.g., [Giraph](http://http://giraph.apache.org) and +(e.g., [Giraph](http://giraph.apache.org) and [GraphLab](http://graphlab.org)). By restricting the types of computation that can be expressed and introducing new techniques to partition and distribute graphs, these systems can efficiently execute sophisticated graph algorithms orders of @@ -43,12 +43,25 @@ magnitude faster than more general *data-parallel* systems.

    -However, the same restrictions that enable these substantial performance gains -also make it difficult to express many of the important stages in a typical graph-analytics pipeline: -constructing the graph, modifying its structure, or expressing computation that -spans multiple graphs. As a consequence, existing graph analytics pipelines -compose graph-parallel and data-parallel systems, leading to extensive data -movement and duplication and a complicated programming model. +However, the same restrictions that enable these substantial performance gains also make it +difficult to express many of the important stages in a typical graph-analytics pipeline: +constructing the graph, modifying its structure, or expressing computation that spans multiple +graphs. Furthermore, how we look at data depends on our objectives and the same raw data may have +many different table and graph views. + +

    + Tables and Graphs + +

    + +As a consequence, it is often necessary to be able to move between table and graph views of the same +physical data and to leverage the properties of each view to easily and efficiently express +computation. However, existing graph analytics pipelines must compose graph-parallel and data- +parallel systems, leading to extensive data movement and duplication and a complicated programming +model.

    GraphX optimizes the representation of `VD` and `ED` when they are plain old data-types (e.g., -> int, double, etc...) reducing the in memory footprint. +> GraphX optimizes the representation of vertex and edge types when they are plain old data-types +> (e.g., int, double, etc...) reducing the in memory footprint by storing them in specialized +> arrays. -In some cases we may wish to have vertices with different property types in the same graph. This can -be accomplished through inheritance. For example to model users and products as a bipartite graph -we might do the following: +In some cases it may be desirable to have vertices with different property types in the same graph. +This can be accomplished through inheritance. For example to model users and products as a +bipartite graph we might do the following: {% highlight scala %} class VertexProperty() @@ -116,9 +132,11 @@ var graph: Graph[VertexProperty, String] = null {% endhighlight %} Like RDDs, property graphs are immutable, distributed, and fault-tolerant. Changes to the values or -structure of the graph are accomplished by producing a new graph with the desired changes. The graph -is partitioned across the workers using a range of vertex-partitioning heuristics. As with RDDs, -each partition of the graph can be recreated on a different machine in the event of a failure. +structure of the graph are accomplished by producing a new graph with the desired changes. Note +that substantial parts of the original graph (i.e., unaffected structure, attributes, and indicies) +are reused in the new graph reducing the cost of this inherently functional data-structure. The +graph is partitioned across the workers using a range of vertex-partitioning heuristics. As with +RDDs, each partition of the graph can be recreated on a different machine in the event of a failure. Logically the property graph corresponds to a pair of typed collections (RDDs) encoding the properties for each vertex and edge. As a consequence, the graph class contains members to access @@ -131,12 +149,12 @@ class Graph[VD, ED] { } {% endhighlight %} -The classes `VertexRDD[VD]` and `EdgeRDD[ED]` extend and are optimized versions of `RDD[(VertexId, +The classes `VertexRDD[VD]` and `EdgeRDD[ED]` extend and are optimized versions of `RDD[(VertexID, VD)]` and `RDD[Edge[ED]]` respectively. Both `VertexRDD[VD]` and `EdgeRDD[ED]` provide additional functionality built around graph computation and leverage internal optimizations. We discuss the `VertexRDD` and `EdgeRDD` API in greater detail in the section on [vertex and edge RDDs](#vertex_and_edge_rdds) but for now they can be thought of as simply RDDs of the form: -`RDD[(VertexId, VD)]` and `RDD[Edge[ED]]`. +`RDD[(VertexID, VD)]` and `RDD[Edge[ED]]`. ### Example Property Graph @@ -168,7 +186,7 @@ code constructs a graph from a collection of RDDs: // Assume the SparkContext has already been constructed val sc: SparkContext // Create an RDD for the vertices -val users: RDD[(VertexID, (String, String))] = +val users: RDD[(VertexId, (String, String))] = sc.parallelize(Array((3L, ("rxin", "student")), (7L, ("jgonzal", "postdoc")), (5L, ("franklin", "prof")), (2L, ("istoica", "prof")))) // Create an RDD for edges @@ -183,7 +201,7 @@ val graph = Graph(users, relationships, defaultUser) In the above example we make use of the [`Edge`][Edge] case class. Edges have a `srcId` and a `dstId` corresponding to the source and destination vertex identifiers. 
In addition, the `Edge` -class contains the `attr` member which contains the edge property. +class has an `attr` member which stores the edge property. [Edge]: api/graphx/index.html#org.apache.spark.graphx.Edge @@ -199,7 +217,7 @@ graph.edges.filter(e => e.srcId > e.dstId).count {% endhighlight %} > Note that `graph.vertices` returns an `VertexRDD[(String, String)]` which extends -> `RDD[(VertexId, (String, String))]` and so we use the scala `case` expression to deconstruct the +> `RDD[(VertexID, (String, String))]` and so we use the scala `case` expression to deconstruct the > tuple. On the other hand, `graph.edges` returns an `EdgeRDD` containing `Edge[String]` objects. > We could have also used the case class type constructor as in the following: > {% highlight scala %} @@ -266,6 +284,75 @@ able to support different graph representations in the future. Each graph repre provide implementations of the core operations and reuse many of the useful operations defined in [`GraphOps`][GraphOps]. +### Summary List of Operators +The following is a quick summary of the functionality defined in both [`Graph`][Graph] and +[`GraphOps`][GraphOps] but presented as members of Graph for simplicity. Note that some function +signatures have been simplified (e.g., default arguments and type constraints removed) and some more +advanced functionality has been removed so please consult the API docs for the official list of +operations. + +{% highlight scala %} +/** Summary of the functionality in the property graph */ +class Graph[VD, ED] { + // Information about the Graph =================================================================== + val numEdges: Long + val numVertices: Long + val inDegrees: VertexRDD[Int] + val outDegrees: VertexRDD[Int] + val degrees: VertexRDD[Int] + // Views of the graph as collections ============================================================= + val vertices: VertexRDD[VD] + val edges: EdgeRDD[ED] + val triplets: RDD[EdgeTriplet[VD, ED]] + // Functions for caching graphs ================================================================== + def persist(newLevel: StorageLevel = StorageLevel.MEMORY_ONLY): Graph[VD, ED] + def cache(): Graph[VD, ED] + def unpersistVertices(blocking: Boolean = true): Graph[VD, ED] + // Change the partitioning heuristic ============================================================ + def partitionBy(partitionStrategy: PartitionStrategy): Graph[VD, ED] + // Transform vertex and edge attributes ========================================================== + def mapVertices[VD2](map: (VertexID, VD) => VD2): Graph[VD2, ED] + def mapEdges[ED2](map: Edge[ED] => ED2): Graph[VD, ED2] + def mapEdges[ED2](map: (PartitionID, Iterator[Edge[ED]]) => Iterator[ED2]): Graph[VD, ED2] + def mapTriplets[ED2](map: EdgeTriplet[VD, ED] => ED2): Graph[VD, ED2] + def mapTriplets[ED2](map: (PartitionID, Iterator[EdgeTriplet[VD, ED]]) => Iterator[ED2]) + : Graph[VD, ED2] + // Modify the graph structure ==================================================================== + def reverse: Graph[VD, ED] + def subgraph( + epred: EdgeTriplet[VD,ED] => Boolean = (x => true), + vpred: (VertexID, VD) => Boolean = ((v, d) => true)) + : Graph[VD, ED] + def mask[VD2, ED2](other: Graph[VD2, ED2]): Graph[VD, ED] + def groupEdges(merge: (ED, ED) => ED): Graph[VD, ED] + // Join RDDs with the graph ====================================================================== + def joinVertices[U](table: RDD[(VertexID, U)])(mapFunc: (VertexID, VD, U) => VD): Graph[VD, ED] + def outerJoinVertices[U, 
VD2](other: RDD[(VertexID, U)]) + (mapFunc: (VertexID, VD, Option[U]) => VD2) + : Graph[VD2, ED] + // Aggregate information about adjacent triplets ================================================= + def collectNeighborIds(edgeDirection: EdgeDirection): VertexRDD[Array[VertexID]] + def collectNeighbors(edgeDirection: EdgeDirection): VertexRDD[Array[(VertexID, VD)]] + def mapReduceTriplets[A: ClassTag]( + mapFunc: EdgeTriplet[VD, ED] => Iterator[(VertexID, A)], + reduceFunc: (A, A) => A, + activeSetOpt: Option[(VertexRDD[_], EdgeDirection)] = None) + : VertexRDD[A] + // Iterative graph-parallel computation ========================================================== + def pregel[A](initialMsg: A, maxIterations: Int, activeDirection: EdgeDirection)( + vprog: (VertexID, VD, A) => VD, + sendMsg: EdgeTriplet[VD, ED] => Iterator[(VertexID,A)], + mergeMsg: (A, A) => A) + : Graph[VD, ED] + // Basic graph algorithms ======================================================================== + def pageRank(tol: Double, resetProb: Double = 0.15): Graph[Double, Double] + def connectedComponents(): Graph[VertexID, ED] + def triangleCount(): Graph[Int, ED] + def stronglyConnectedComponents(numIter: Int): Graph[VertexID, ED] +} +{% endhighlight %} + + ## Property Operators In direct analogy to the RDD `map` operator, the property @@ -273,7 +360,7 @@ graph contains the following: {% highlight scala %} class Graph[VD, ED] { - def mapVertices[VD2](map: (VertexID, VD) => VD2): Graph[VD2, ED] + def mapVertices[VD2](map: (VertexId, VD) => VD2): Graph[VD2, ED] def mapEdges[ED2](map: Edge[ED] => ED2): Graph[VD, ED2] def mapTriplets[ED2](map: EdgeTriplet[VD, ED] => ED2): Graph[VD, ED2] } @@ -295,7 +382,7 @@ val newGraph = Graph(newVertices, graph.edges) val newGraph = graph.mapVertices((id, attr) => mapUdf(id, attr)) {% endhighlight %} -[Graph.mapVertices]: api/graphx/index.html#org.apache.spark.graphx.Graph@mapVertices[VD2]((VertexID,VD)ā‡’VD2)(ClassTag[VD2]):Graph[VD2,ED] +[Graph.mapVertices]: api/graphx/index.html#org.apache.spark.graphx.Graph@mapVertices[VD2]((VertexId,VD)ā‡’VD2)(ClassTag[VD2]):Graph[VD2,ED] These operators are often used to initialize the graph for a particular computation or project away unnecessary properties. For example, given a graph with the out-degrees as the vertex properties @@ -321,7 +408,7 @@ add more in the future. The following is a list of the basic structural operato class Graph[VD, ED] { def reverse: Graph[VD, ED] def subgraph(epred: EdgeTriplet[VD,ED] => Boolean, - vpred: (VertexID, VD) => Boolean): Graph[VD, ED] + vpred: (VertexId, VD) => Boolean): Graph[VD, ED] def mask[VD2, ED2](other: Graph[VD2, ED2]): Graph[VD, ED] def groupEdges(merge: (ED, ED) => ED): Graph[VD,ED] } @@ -340,11 +427,11 @@ satisfy the edge predicate *and connect vertices that satisfy the vertex predica operator can be used in number of situations to restrict the graph to the vertices and edges of interest or eliminate broken links. 
For example in the following code we remove broken links: -[Graph.subgraph]: api/graphx/index.html#org.apache.spark.graphx.Graph@subgraph((EdgeTriplet[VD,ED])ā‡’Boolean,(VertexID,VD)ā‡’Boolean):Graph[VD,ED] +[Graph.subgraph]: api/graphx/index.html#org.apache.spark.graphx.Graph@subgraph((EdgeTriplet[VD,ED])ā‡’Boolean,(VertexId,VD)ā‡’Boolean):Graph[VD,ED] {% highlight scala %} // Create an RDD for the vertices -val users: RDD[(VertexID, (String, String))] = +val users: RDD[(VertexId, (String, String))] = sc.parallelize(Array((3L, ("rxin", "student")), (7L, ("jgonzal", "postdoc")), (5L, ("franklin", "prof")), (2L, ("istoica", "prof")), (4L, ("peter", "student")))) @@ -407,9 +494,9 @@ using the *join* operators. Below we list the key join operators: {% highlight scala %} class Graph[VD, ED] { - def joinVertices[U](table: RDD[(VertexID, U)])(map: (VertexID, VD, U) => VD) + def joinVertices[U](table: RDD[(VertexId, U)])(map: (VertexId, VD, U) => VD) : Graph[VD, ED] - def outerJoinVertices[U, VD2](table: RDD[(VertexID, U)])(map: (VertexID, VD, Option[U]) => VD2) + def outerJoinVertices[U, VD2](table: RDD[(VertexId, U)])(map: (VertexId, VD, Option[U]) => VD2) : Graph[VD2, ED] } {% endhighlight %} @@ -419,13 +506,13 @@ returns a new graph with the vertex properties obtained by applying the user def to the result of the joined vertices. Vertices without a matching value in the RDD retain their original value. -[GraphOps.joinVertices]: api/graphx/index.html#org.apache.spark.graphx.GraphOps@joinVertices[U](RDD[(VertexID,U)])((VertexID,VD,U)ā‡’VD)(ClassTag[U]):Graph[VD,ED] +[GraphOps.joinVertices]: api/graphx/index.html#org.apache.spark.graphx.GraphOps@joinVertices[U](RDD[(VertexId,U)])((VertexId,VD,U)ā‡’VD)(ClassTag[U]):Graph[VD,ED] > Note that if the RDD contains more than one value for a given vertex only one will be used. It > is therefore recommended that the input RDD be first made unique using the following which will > also *pre-index* the resulting values to substantially accelerate the subsequent join. > {% highlight scala %} -val nonUniqueCosts: RDD[(VertexId, Double)] +val nonUniqueCosts: RDD[(VertexID, Double)] val uniqueCosts: VertexRDD[Double] = graph.vertices.aggregateUsingIndex(nonUnique, (a,b) => a + b) val joinedGraph = graph.joinVertices(uniqueCosts)( @@ -438,7 +525,7 @@ property type. Because not all vertices may have a matching value in the input function takes an `Option` type. For example, we can setup a graph for PageRank by initializing vertex properties with their `outDegree`. -[Graph.outerJoinVertices]: api/graphx/index.html#org.apache.spark.graphx.Graph@outerJoinVertices[U,VD2](RDD[(VertexID,U)])((VertexID,VD,Option[U])ā‡’VD2)(ClassTag[U],ClassTag[VD2]):Graph[VD2,ED] +[Graph.outerJoinVertices]: api/graphx/index.html#org.apache.spark.graphx.Graph@outerJoinVertices[U,VD2](RDD[(VertexId,U)])((VertexId,VD,Option[U])ā‡’VD2)(ClassTag[U],ClassTag[VD2]):Graph[VD2,ED] {% highlight scala %} @@ -457,7 +544,7 @@ val degreeGraph = graph.outerJoinVertices(outDegrees) { (id, oldAttr, outDegOpt) > provide type annotation for the user defined function: > {% highlight scala %} val joinedGraph = graph.joinVertices(uniqueCosts, - (id: VertexId, oldCost: Double, extraCost: Double) => oldCost + extraCost) + (id: VertexID, oldCost: Double, extraCost: Double) => oldCost + extraCost) {% endhighlight %} @@ -472,7 +559,7 @@ PageRank Value, shortest path to the source, and smallest reachable vertex id). 
### Map Reduce Triplets (mapReduceTriplets) -[Graph.mapReduceTriplets]: api/graphx/index.html#org.apache.spark.graphx.Graph@mapReduceTriplets[A](mapFunc:org.apache.spark.graphx.EdgeTriplet[VD,ED]=>Iterator[(org.apache.spark.graphx.VertexID,A)],reduceFunc:(A,A)=>A,activeSetOpt:Option[(org.apache.spark.graphx.VertexRDD[_],org.apache.spark.graphx.EdgeDirection)])(implicitevidence$10:scala.reflect.ClassTag[A]):org.apache.spark.graphx.VertexRDD[A] +[Graph.mapReduceTriplets]: api/graphx/index.html#org.apache.spark.graphx.Graph@mapReduceTriplets[A](mapFunc:org.apache.spark.graphx.EdgeTriplet[VD,ED]=>Iterator[(org.apache.spark.graphx.VertexId,A)],reduceFunc:(A,A)=>A,activeSetOpt:Option[(org.apache.spark.graphx.VertexRDD[_],org.apache.spark.graphx.EdgeDirection)])(implicitevidence$10:scala.reflect.ClassTag[A]):org.apache.spark.graphx.VertexRDD[A] The core (heavily optimized) aggregation primitive in GraphX is the [`mapReduceTriplets`][Graph.mapReduceTriplets] operator: @@ -480,7 +567,7 @@ The core (heavily optimized) aggregation primitive in GraphX is the {% highlight scala %} class Graph[VD, ED] { def mapReduceTriplets[A]( - map: EdgeTriplet[VD, ED] => Iterator[(VertexID, A)], + map: EdgeTriplet[VD, ED] => Iterator[(VertexId, A)], reduce: (A, A) => A) : VertexRDD[A] } @@ -495,26 +582,26 @@ containing the aggregate message (of type `A`) destined to each vertex. Vertice receive a message are not included in the returned `VertexRDD`.
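As a minimal illustration of the signature above (a sketch only, not an example from the guide, and assuming the `graph: Graph[(String, String), String]` constructed in the earlier property-graph example), the in-degree of each vertex can be computed by sending the message `1` to the destination of every triplet and summing the messages:

{% highlight scala %}
// Sketch: count incoming edges per vertex with mapReduceTriplets.
val inDegrees: VertexRDD[Int] = graph.mapReduceTriplets[Int](
  triplet => Iterator((triplet.dstId, 1)), // map: send 1 to each destination vertex
  (a, b) => a + b                          // reduce: sum the messages arriving at a vertex
)
{% endhighlight %}

This computes the same result as the built-in `inDegrees` member listed in the operator summary; the example later in this section shows a less trivial aggregation.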

    -

    -Note that mapReduceTriplets takes an additional optional activeSet -(see API docs) which restricts the map phase to edges adjacent to the vertices in the provided -VertexRDD: -

    + +

    Note that mapReduceTriplets takes an additional optional activeSet +(not shown above; see the API docs for details) which restricts the map phase to edges adjacent to the +vertices in the provided VertexRDD:

    + {% highlight scala %} activeSetOpt: Option[(VertexRDD[_], EdgeDirection)] = None {% endhighlight %} -

    -The EdgeDirection specifies which edges adjacent to the vertex set are included in the map phase. If -the direction is In, mapFunc will only be run only on edges with -destination in the active set. If the direction is Out, mapFunc will only -be run only on edges originating from vertices in the active set. If the direction is -Either, mapFunc will be run only on edges with either vertex in the -active set. If the direction is Both, mapFunc will be run only on edges -with both vertices in the active set. The active set must be derived from the set of vertices in -the graph. Restricting computation to triplets adjacent to a subset of the vertices is often -necessary in incremental iterative computation and is a key part of the GraphX implementation of -Pregel. -

    + +

    The EdgeDirection specifies which edges adjacent to the vertex set are included in the map +phase. If the direction is In, then the user-defined map function will +only be run on edges with the destination vertex in the active set. If the direction is +Out, then the map function will only be run on edges originating from +vertices in the active set. If the direction is Either, then the map +function will be run only on edges with either vertex in the active set. If the direction is +Both, then the map function will be run only on edges with both vertices +in the active set. The active set must be derived from the set of vertices in the graph. +Restricting computation to triplets adjacent to a subset of the vertices is often necessary in +incremental iterative computation and is a key part of the GraphX implementation of Pregel.
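To make the optional `activeSetOpt` argument concrete, here is a small sketch (hypothetical, not taken from the guide; `active` and `msgsToActive` are illustrative names) that restricts the map phase to edges pointing into a vertex set derived from the graph, in this case the vertices of degree greater than one:

{% highlight scala %}
// Sketch: run the map function only on edges whose destination is in `active`.
val active: VertexRDD[Int] = graph.degrees.filter { case (_, deg) => deg > 1 }
val msgsToActive: VertexRDD[Int] = graph.mapReduceTriplets[Int](
  triplet => Iterator((triplet.dstId, 1)),
  (a, b) => a + b,
  activeSetOpt = Some((active, EdgeDirection.In))
)
{% endhighlight %}

Because the direction is `EdgeDirection.In`, triplets whose destination vertex is not in `active` are skipped entirely during the map phase.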

    +
    In the following example we use the `mapReduceTriplets` operator to compute the average age of the @@ -547,8 +634,8 @@ val avgAgeOfOlderFollowers: VertexRDD[Double] = avgAgeOfOlderFollowers.collect.foreach(println(_)) {% endhighlight %} -> Note that the `mapReduceTriplets` operation performs optimally when the messages (and their sums) -> are constant sized (e.g., floats and addition instead of lists and concatenation). More +> Note that the `mapReduceTriplets` operation performs optimally when the messages (and the sums of +> messages) are constant sized (e.g., floats and addition instead of lists and concatenation). More > precisely, the result of `mapReduceTriplets` should ideally be sub-linear in the degree of each > vertex. @@ -562,13 +649,13 @@ compute the max in, out, and total degrees: {% highlight scala %} // Define a reduce operation to compute the highest degree vertex -def max(a: (VertexID, Int), b: (VertexID, Int)): (VertexID, Int) = { +def max(a: (VertexId, Int), b: (VertexId, Int)): (VertexId, Int) = { if (a._2 > b._2) a else b } // Compute the max degrees -val maxInDegree: (VertexID, Int) = graph.inDegrees.reduce(max) -val maxOutDegree: (VertexID, Int) = graph.outDegrees.reduce(max) -val maxDegrees: (VertexID, Int) = graph.degrees.reduce(max) +val maxInDegree: (VertexId, Int) = graph.inDegrees.reduce(max) +val maxOutDegree: (VertexId, Int) = graph.outDegrees.reduce(max) +val maxDegrees: (VertexId, Int) = graph.degrees.reduce(max) {% endhighlight %} ### Collecting Neighbors @@ -578,14 +665,14 @@ attributes at each vertex. This can be easily accomplished using the [`collectNeighborIds`][GraphOps.collectNeighborIds] and the [`collectNeighbors`][GraphOps.collectNeighbors] operators. -[GraphOps.collectNeighborIds]: api/graphx/index.html#org.apache.spark.graphx.GraphOps@collectNeighborIds(EdgeDirection):VertexRDD[Array[VertexID]] -[GraphOps.collectNeighbors]: api/graphx/index.html#org.apache.spark.graphx.GraphOps@collectNeighbors(EdgeDirection):VertexRDD[Array[(VertexID,VD)]] +[GraphOps.collectNeighborIds]: api/graphx/index.html#org.apache.spark.graphx.GraphOps@collectNeighborIds(EdgeDirection):VertexRDD[Array[VertexId]] +[GraphOps.collectNeighbors]: api/graphx/index.html#org.apache.spark.graphx.GraphOps@collectNeighbors(EdgeDirection):VertexRDD[Array[(VertexId,VD)]] {% highlight scala %} class GraphOps[VD, ED] { - def collectNeighborIds(edgeDirection: EdgeDirection): VertexRDD[Array[VertexID]] - def collectNeighbors(edgeDirection: EdgeDirection): VertexRDD[ Array[(VertexID, VD)] ] + def collectNeighborIds(edgeDirection: EdgeDirection): VertexRDD[Array[VertexId]] + def collectNeighbors(edgeDirection: EdgeDirection): VertexRDD[ Array[(VertexId, VD)] ] } {% endhighlight %} @@ -593,11 +680,20 @@ class GraphOps[VD, ED] { > substantial communication. If possible try expressing the same computation using the > `mapReduceTriplets` operator directly. +## Caching and Uncaching + +In Spark, RDDs are not persisted in memory by default. To avoid recomputation, they must be explicitly cached when using them multiple times (see the [Spark Programming Guide][RDD Persistence]). Graphs in GraphX behave the same way. **When using a graph multiple times, make sure to call [`Graph.cache()`][Graph.cache] on it first.** + +[RDD Persistence]: scala-programming-guide.html#rdd-persistence +[Graph.cache]: api/graphx/index.html#org.apache.spark.graphx.Graph@cache():Graph[VD,ED] + +In iterative computations, *uncaching* may also be necessary for best performance. 
By default, cached RDDs and graphs will remain in memory until memory pressure forces them to be evicted in LRU order. For iterative computation, intermediate results from previous iterations will fill up the cache. Though they will eventually be evicted, the unnecessary data stored in memory will slow down garbage collection. It would be more efficient to uncache intermediate results as soon as they are no longer necessary. This involves materializing (caching and forcing) a graph or RDD every iteration, uncaching all other datasets, and only using the materialized dataset in future iterations. However, because graphs are composed of multiple RDDs, it can be difficult to unpersist them correctly. **For iterative computation we recommend using the Pregel API, which correctly unpersists intermediate results.** + # Pregel API Graphs are inherently recursive data-structures as properties of vertices depend on properties of -their neighbors which intern depend on properties of *their* neighbors. As a +their neighbors which in turn depend on properties of *their* neighbors. As a consequence many important graph algorithms iteratively recompute the properties of each vertex until a fixed-point condition is reached. A range of graph-parallel abstractions have been proposed to express these iterative algorithms. GraphX exposes a Pregel-like operator which is a fusion of @@ -620,7 +716,7 @@ messages remaining. The following is the type signature of the [Pregel operator][GraphOps.pregel] as well as a *sketch* of its implementation (note calls to graph.cache have been removed): -[GraphOps.pregel]: api/graphx/index.html#org.apache.spark.graphx.GraphOps@pregel[A](A,Int,EdgeDirection)((VertexID,VD,A)ā‡’VD,(EdgeTriplet[VD,ED])ā‡’Iterator[(VertexID,A)],(A,A)ā‡’A)(ClassTag[A]):Graph[VD,ED] +[GraphOps.pregel]: api/graphx/index.html#org.apache.spark.graphx.GraphOps@pregel[A](A,Int,EdgeDirection)((VertexId,VD,A)ā‡’VD,(EdgeTriplet[VD,ED])ā‡’Iterator[(VertexId,A)],(A,A)ā‡’A)(ClassTag[A]):Graph[VD,ED] {% highlight scala %} class GraphOps[VD, ED] { @@ -628,8 +724,8 @@ class GraphOps[VD, ED] { (initialMsg: A, maxIter: Int = Int.MaxValue, activeDir: EdgeDirection = EdgeDirection.Out) - (vprog: (VertexID, VD, A) => VD, - sendMsg: EdgeTriplet[VD, ED] => Iterator[(VertexID, A)], + (vprog: (VertexId, VD, A) => VD, + sendMsg: EdgeTriplet[VD, ED] => Iterator[(VertexId, A)], mergeMsg: (A, A) => A) : Graph[VD, ED] = { // Receive the initial message at each vertex @@ -674,7 +770,7 @@ import org.apache.spark.graphx.util.GraphGenerators // A graph with edge attributes containing distances val graph: Graph[Int, Double] = GraphGenerators.logNormalGraph(sc, numVertices = 100).mapEdges(e => e.attr.toDouble) -val sourceId: VertexID = 42 // The ultimate source +val sourceId: VertexId = 42 // The ultimate source // Initialize the graph such that all vertices except the root have distance infinity. 
val initialGraph = graph.mapVertices((id, _) => if (id == sourceId) 0.0 else Double.PositiveInfinity) val sssp = initialGraph.pregel(Double.PositiveInfinity)( @@ -721,7 +817,7 @@ It creates a `Graph` from the specified edges, automatically creating any vertic {% highlight scala %} object Graph { def apply[VD, ED]( - vertices: RDD[(VertexID, VD)], + vertices: RDD[(VertexId, VD)], edges: RDD[Edge[ED]], defaultVertexAttr: VD = null) : Graph[VD, ED] @@ -731,7 +827,7 @@ object Graph { defaultValue: VD): Graph[VD, ED] def fromEdgeTuples[VD]( - rawEdges: RDD[(VertexID, VertexID)], + rawEdges: RDD[(VertexId, VertexId)], defaultValue: VD, uniqueEdges: Option[PartitionStrategy] = None): Graph[VD, Int] @@ -747,8 +843,8 @@ object Graph { [PartitionStrategy]: api/graphx/index.html#org.apache.spark.graphx.PartitionStrategy$ [GraphLoader.edgeListFile]: api/graphx/index.html#org.apache.spark.graphx.GraphLoader$@edgeListFile(SparkContext,String,Boolean,Int):Graph[Int,Int] -[Graph.apply]: api/graphx/index.html#org.apache.spark.graphx.Graph$@apply[VD,ED](RDD[(VertexID,VD)],RDD[Edge[ED]],VD)(ClassTag[VD],ClassTag[ED]):Graph[VD,ED] -[Graph.fromEdgeTuples]: api/graphx/index.html#org.apache.spark.graphx.Graph$@fromEdgeTuples[VD](RDD[(VertexID,VertexID)],VD,Option[PartitionStrategy])(ClassTag[VD]):Graph[VD,Int] +[Graph.apply]: api/graphx/index.html#org.apache.spark.graphx.Graph$@apply[VD,ED](RDD[(VertexId,VD)],RDD[Edge[ED]],VD)(ClassTag[VD],ClassTag[ED]):Graph[VD,ED] +[Graph.fromEdgeTuples]: api/graphx/index.html#org.apache.spark.graphx.Graph$@fromEdgeTuples[VD](RDD[(VertexId,VertexId)],VD,Option[PartitionStrategy])(ClassTag[VD]):Graph[VD,Int] [Graph.fromEdges]: api/graphx/index.html#org.apache.spark.graphx.Graph$@fromEdges[VD,ED](RDD[Edge[ED]],VD)(ClassTag[VD],ClassTag[ED]):Graph[VD,ED] # Vertex and Edge RDDs @@ -761,47 +857,46 @@ respectively. In this section we review some of the additional useful functiona ## VertexRDDs -The `VertexRDD[A]` extends the more traditional `RDD[(VertexId, A)]` but adds the additional -constraint that each `VertexId` occurs only *once*. Moreover, `VertexRDD[A]` represents a *set* of -vertices each with an attribute of type `A`. Internally, this is achieved by storing the vertex -attributes in a reusable hash-map data-structure. As a consequence if two `VertexRDD`s are derived -from the same base `VertexRDD` (e.g., by `filter` or `mapValues`) they can be joined in constant -time without hash evaluations. To leverage this indexed data-structure, the `VertexRDD` exposes the -following additional functionality: +The `VertexRDD[A]` extends `RDD[(VertexID, A)]` and adds the additional constraint that each +`VertexID` occurs only *once*. Moreover, `VertexRDD[A]` represents a *set* of vertices each with an +attribute of type `A`. Internally, this is achieved by storing the vertex attributes in a reusable +hash-map data-structure. As a consequence if two `VertexRDD`s are derived from the same base +`VertexRDD` (e.g., by `filter` or `mapValues`) they can be joined in constant time without hash +evaluations. 
To leverage this indexed data-structure, the `VertexRDD` exposes the following +additional functionality: {% highlight scala %} -class VertexRDD[VD] { +class VertexRDD[VD] extends RDD[(VertexID, VD)] { // Filter the vertex set but preserves the internal index - def filter(pred: Tuple2[VertexID, VD] => Boolean): VertexRDD[VD] + def filter(pred: Tuple2[VertexId, VD] => Boolean): VertexRDD[VD] // Transform the values without changing the ids (preserves the internal index) def mapValues[VD2](map: VD => VD2): VertexRDD[VD2] - def mapValues[VD2](map: (VertexID, VD) => VD2): VertexRDD[VD2] + def mapValues[VD2](map: (VertexId, VD) => VD2): VertexRDD[VD2] // Remove vertices from this set that appear in the other set def diff(other: VertexRDD[VD]): VertexRDD[VD] // Join operators that take advantage of the internal indexing to accelerate joins (substantially) - def leftJoin[VD2, VD3](other: RDD[(VertexID, VD2)])(f: (VertexID, VD, Option[VD2]) => VD3): VertexRDD[VD3] - def innerJoin[U, VD2](other: RDD[(VertexID, U)])(f: (VertexID, VD, U) => VD2): VertexRDD[VD2] + def leftJoin[VD2, VD3](other: RDD[(VertexId, VD2)])(f: (VertexId, VD, Option[VD2]) => VD3): VertexRDD[VD3] + def innerJoin[U, VD2](other: RDD[(VertexId, U)])(f: (VertexId, VD, U) => VD2): VertexRDD[VD2] // Use the index on this RDD to accelerate a `reduceByKey` operation on the input RDD. - def aggregateUsingIndex[VD2](other: RDD[(VertexID, VD2)], reduceFunc: (VD2, VD2) => VD2): VertexRDD[VD2] + def aggregateUsingIndex[VD2](other: RDD[(VertexId, VD2)], reduceFunc: (VD2, VD2) => VD2): VertexRDD[VD2] } {% endhighlight %} Notice, for example, how the `filter` operator returns an `VertexRDD`. Filter is actually implemented using a `BitSet` thereby reusing the index and preserving the ability to do fast joins with other `VertexRDD`s. Likewise, the `mapValues` operators do not allow the `map` function to -change the `VertexId` thereby enabling the same `HashMap` data-structures to be reused. Both the +change the `VertexID` thereby enabling the same `HashMap` data-structures to be reused. Both the `leftJoin` and `innerJoin` are able to identify when joining two `VertexRDD`s derived from the same `HashMap` and implement the join by linear scan rather than costly point lookups. -The `aggregateUsingIndex` operator can be slightly confusing but is also useful for efficient -construction of a new `VertexRDD` from an `RDD[(VertexId, A)]`. Conceptually, if I have constructed -a `VertexRDD[B]` over a set of vertices, *which is a super-set* of the vertices in some -`RDD[(VertexId, A)]` then I can reuse the index to both aggregate and then subsequently index the -RDD. For example: +The `aggregateUsingIndex` operator is useful for efficient construction of a new `VertexRDD` from an +`RDD[(VertexID, A)]`. Conceptually, if I have constructed a `VertexRDD[B]` over a set of vertices, +*which is a super-set* of the vertices in some `RDD[(VertexID, A)]` then I can reuse the index to +both aggregate and then subsequently index the `RDD[(VertexID, A)]`. 
For example: {% highlight scala %} val setA: VertexRDD[Int] = VertexRDD(sc.parallelize(0L until 100L).map(id => (id, 1))) -val rddB: RDD[(VertexID, Double)] = sc.parallelize(0L until 100L).flatMap(id => List((id, 1.0), (id, 2.0))) +val rddB: RDD[(VertexId, Double)] = sc.parallelize(0L until 100L).flatMap(id => List((id, 1.0), (id, 2.0))) // There should be 200 entries in rddB rddB.count val setB: VertexRDD[Double] = setA.aggregateUsingIndex(rddB, _ + _) @@ -813,10 +908,10 @@ val setC: VertexRDD[Double] = setA.innerJoin(setB)((id, a, b) => a + b) ## EdgeRDDs -The `EdgeRDD[ED]`, which extends `RDD[Edge[ED]]` is considerably simpler than the `VertexRDD`. -GraphX organizes the edges in blocks partitioned using one of the various partitioning strategies -defined in [`PartitionStrategy`][PartitionStrategy]. Within each partition, edge attributes and -adjacency structure, are stored separately enabling maximum reuse when changing attribute values. +The `EdgeRDD[ED]`, which extends `RDD[Edge[ED]]` organizes the edges in blocks partitioned using one +of the various partitioning strategies defined in [`PartitionStrategy`][PartitionStrategy]. Within +each partition, edge attributes and adjacency structure, are stored separately enabling maximum +reuse when changing attribute values. [PartitionStrategy]: api/graphx/index.html#org.apache.spark.graphx.PartitionStrategy @@ -827,11 +922,11 @@ def mapValues[ED2](f: Edge[ED] => ED2): EdgeRDD[ED2] // Revere the edges reusing both attributes and structure def reverse: EdgeRDD[ED] // Join two `EdgeRDD`s partitioned using the same partitioning strategy. -def innerJoin[ED2, ED3](other: EdgeRDD[ED2])(f: (VertexID, VertexID, ED, ED2) => ED3): EdgeRDD[ED3] +def innerJoin[ED2, ED3](other: EdgeRDD[ED2])(f: (VertexId, VertexId, ED, ED2) => ED3): EdgeRDD[ED3] {% endhighlight %} In most applications we have found that operations on the `EdgeRDD` are accomplished through the -graph or rely on operations defined in the base `RDD` class. +graph operators or rely on operations defined in the base `RDD` class. # Optimized Representation @@ -853,7 +948,9 @@ reduce both the communication and storage overhead. Logically, this corresponds to machines and allowing vertices to span multiple machines. The exact method of assigning edges depends on the [`PartitionStrategy`][PartitionStrategy] and there are several tradeoffs to the various heuristics. Users can choose between different strategies by repartitioning the graph with -the [`Graph.partitionBy`][Graph.partitionBy] operator. +the [`Graph.partitionBy`][Graph.partitionBy] operator. The default partitioning strategy is to use +the initial partitioning of the edges as provided on graph construction. However, users can easily +switch to 2D-partitioning or other heuristics included in GraphX. [Graph.partitionBy]: api/graphx/index.html#org.apache.spark.graphx.Graph$@partitionBy(partitionStrategy:org.apache.spark.graphx.PartitionStrategy):org.apache.spark.graphx.Graph[VD,ED] @@ -867,16 +964,15 @@ the [`Graph.partitionBy`][Graph.partitionBy] operator. Once the edges have be partitioned the key challenge to efficient graph-parallel computation is efficiently joining vertex attributes with the edges. Because real-world graphs typically have more -edges than vertices, we move vertex attributes to the edges. - - - - +edges than vertices, we move vertex attributes to the edges. 
Because not all partitions will +contain edges adjacent to all vertices we internally maintain a routing table which identifies where +to broadcast vertices when implementing the join required for operations like `triplets` and +`mapReduceTriplets`. # Graph Algorithms -GraphX includes a set of graph algorithms in to simplify analytics. The algorithms are contained in the `org.apache.spark.graphx.lib` package and can be accessed directly as methods on `Graph` via [`GraphOps`][GraphOps]. This section describes the algorithms and how they are used. +GraphX includes a set of graph algorithms to simplify analytics tasks. The algorithms are contained in the `org.apache.spark.graphx.lib` package and can be accessed directly as methods on `Graph` via [`GraphOps`][GraphOps]. This section describes the algorithms and how they are used. ## PageRank @@ -953,13 +1049,6 @@ val triCountByUsername = users.join(triCounts).map { case (id, (username, tc)) = println(triCountByUsername.collect().mkString("\n")) {% endhighlight %} -

    - Tables and Graphs - -
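Returning to the `Graph.partitionBy` operator described in the Optimized Representation section above, the following is a brief sketch (assumptions only, not an example from the guide) of switching the edge-partitioning heuristic before running an aggregation-heavy algorithm; `EdgePartition2D` is one of the strategies provided by `PartitionStrategy`, and the tolerance passed to `pageRank` is an arbitrary illustrative value:

{% highlight scala %}
// Sketch: repartition the edges with the 2D strategy, cache the result, and
// run an algorithm that benefits from the improved edge placement.
val partitionedGraph = graph.partitionBy(PartitionStrategy.EdgePartition2D).cache()
val ranks = partitionedGraph.pageRank(0.0001).vertices
{% endhighlight %}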

    # Examples diff --git a/docs/img/java-sm.png b/docs/img/java-sm.png new file mode 100644 index 0000000000000..a82ee7d682e49 Binary files /dev/null and b/docs/img/java-sm.png differ diff --git a/docs/img/python-sm.png b/docs/img/python-sm.png new file mode 100644 index 0000000000000..ae01e05252abd Binary files /dev/null and b/docs/img/python-sm.png differ diff --git a/docs/img/scala-sm.png b/docs/img/scala-sm.png new file mode 100644 index 0000000000000..30db034b70cf9 Binary files /dev/null and b/docs/img/scala-sm.png differ diff --git a/docs/img/streaming-arch.png b/docs/img/streaming-arch.png new file mode 100644 index 0000000000000..bc57b460fdf8b Binary files /dev/null and b/docs/img/streaming-arch.png differ diff --git a/docs/img/streaming-dstream-ops.png b/docs/img/streaming-dstream-ops.png new file mode 100644 index 0000000000000..a1c5634aa3c3a Binary files /dev/null and b/docs/img/streaming-dstream-ops.png differ diff --git a/docs/img/streaming-dstream-window.png b/docs/img/streaming-dstream-window.png new file mode 100644 index 0000000000000..276d2fee5e30e Binary files /dev/null and b/docs/img/streaming-dstream-window.png differ diff --git a/docs/img/streaming-dstream.png b/docs/img/streaming-dstream.png new file mode 100644 index 0000000000000..90f43b8c7138c Binary files /dev/null and b/docs/img/streaming-dstream.png differ diff --git a/docs/img/streaming-figures.pptx b/docs/img/streaming-figures.pptx new file mode 100644 index 0000000000000..1b18c2ee0ea3e Binary files /dev/null and b/docs/img/streaming-figures.pptx differ diff --git a/docs/img/streaming-flow.png b/docs/img/streaming-flow.png new file mode 100644 index 0000000000000..a870cb9b1839b Binary files /dev/null and b/docs/img/streaming-flow.png differ diff --git a/docs/index.md b/docs/index.md index debdb33108676..4eb297df39144 100644 --- a/docs/index.md +++ b/docs/index.md @@ -9,7 +9,7 @@ It also supports a rich set of higher-level tools including [Shark](http://shark # Downloading -Get Spark by visiting the [downloads page](http://spark.incubator.apache.org/downloads.html) of the Apache Spark site. This documentation is for Spark version {{site.SPARK_VERSION}}. +Get Spark by visiting the [downloads page](http://spark.apache.org/downloads.html) of the Apache Spark site. This documentation is for Spark version {{site.SPARK_VERSION}}. Spark runs on both Windows and UNIX-like systems (e.g. Linux, Mac OS). All you need to run it is to have `java` to installed on your system `PATH`, or the `JAVA_HOME` environment variable pointing to a Java installation. @@ -19,7 +19,7 @@ Spark uses [Simple Build Tool](http://www.scala-sbt.org), which is bundled with sbt/sbt assembly -For its Scala API, Spark {{site.SPARK_VERSION}} depends on Scala {{site.SCALA_VERSION}}. If you write applications in Scala, you will need to use this same version of Scala in your own program -- newer major versions may not work. You can get the right version of Scala from [scala-lang.org](http://www.scala-lang.org/download/). +For its Scala API, Spark {{site.SPARK_VERSION}} depends on Scala {{site.SCALA_BINARY_VERSION}}. If you write applications in Scala, you will need to use a compatible Scala version (e.g. {{site.SCALA_BINARY_VERSION}}.X) -- newer major versions may not work. You can get the right version of Scala from [scala-lang.org](http://www.scala-lang.org/download/). 
# Running the Examples and Shell @@ -75,7 +75,7 @@ For this version of Spark (0.8.1) Hadoop 2.2.x (or newer) users will have to bui * [Spark Programming Guide](scala-programming-guide.html): an overview of Spark concepts, and details on the Scala API * [Java Programming Guide](java-programming-guide.html): using Spark from Java * [Python Programming Guide](python-programming-guide.html): using Spark from Python -* [Spark Streaming](streaming-programming-guide.html): using the alpha release of Spark Streaming +* [Spark Streaming](streaming-programming-guide.html): Spark's API for processing data streams * [MLlib (Machine Learning)](mllib-guide.html): Spark's built-in machine learning library * [Bagel (Pregel on Spark)](bagel-programming-guide.html): simple graph processing model * [GraphX (Graphs on Spark)](graphx-programming-guide.html): Spark's new API for graphs @@ -96,7 +96,7 @@ For this version of Spark (0.8.1) Hadoop 2.2.x (or newer) users will have to bui * [Amazon EC2](ec2-scripts.html): scripts that let you launch a cluster on EC2 in about 5 minutes * [Standalone Deploy Mode](spark-standalone.html): launch a standalone cluster quickly without a third-party cluster manager * [Mesos](running-on-mesos.html): deploy a private cluster using - [Apache Mesos](http://incubator.apache.org/mesos) + [Apache Mesos](http://mesos.apache.org) * [YARN](running-on-yarn.html): deploy Spark on top of Hadoop NextGen (YARN) **Other documents:** @@ -110,20 +110,20 @@ For this version of Spark (0.8.1) Hadoop 2.2.x (or newer) users will have to bui **External resources:** -* [Spark Homepage](http://spark.incubator.apache.org) +* [Spark Homepage](http://spark.apache.org) * [Shark](http://shark.cs.berkeley.edu): Apache Hive over Spark -* [Mailing Lists](http://spark.incubator.apache.org/mailing-lists.html): ask questions about Spark here +* [Mailing Lists](http://spark.apache.org/mailing-lists.html): ask questions about Spark here * [AMP Camps](http://ampcamp.berkeley.edu/): a series of training camps at UC Berkeley that featured talks and exercises about Spark, Shark, Mesos, and more. [Videos](http://ampcamp.berkeley.edu/agenda-2012), [slides](http://ampcamp.berkeley.edu/agenda-2012) and [exercises](http://ampcamp.berkeley.edu/exercises-2012) are available online for free. -* [Code Examples](http://spark.incubator.apache.org/examples.html): more are also available in the [examples subfolder](https://github.com/apache/incubator-spark/tree/master/examples/src/main/scala/) of Spark +* [Code Examples](http://spark.apache.org/examples.html): more are also available in the [examples subfolder](https://github.com/apache/spark/tree/master/examples/src/main/scala/) of Spark * [Paper Describing Spark](http://www.cs.berkeley.edu/~matei/papers/2012/nsdi_spark.pdf) * [Paper Describing Spark Streaming](http://www.eecs.berkeley.edu/Pubs/TechRpts/2012/EECS-2012-259.pdf) # Community -To get help using Spark or keep up with Spark development, sign up for the [user mailing list](http://spark.incubator.apache.org/mailing-lists.html). +To get help using Spark or keep up with Spark development, sign up for the [user mailing list](http://spark.apache.org/mailing-lists.html). If you're in the San Francisco Bay Area, there's a regular [Spark meetup](http://www.meetup.com/spark-users/) every few weeks. Come by to meet the developers and other users. 
diff --git a/docs/java-programming-guide.md b/docs/java-programming-guide.md index 07732fa1229f3..5c73dbb25ede8 100644 --- a/docs/java-programming-guide.md +++ b/docs/java-programming-guide.md @@ -189,7 +189,7 @@ We hope to generate documentation with Java-style syntax in the future. # Where to Go from Here Spark includes several sample programs using the Java API in -[`examples/src/main/java`](https://github.com/apache/incubator-spark/tree/master/examples/src/main/java/org/apache/spark/examples). You can run them by passing the class name to the +[`examples/src/main/java`](https://github.com/apache/spark/tree/master/examples/src/main/java/org/apache/spark/examples). You can run them by passing the class name to the `bin/run-example` script included in Spark; for example: ./bin/run-example org.apache.spark.examples.JavaWordCount diff --git a/docs/js/main.js b/docs/js/main.js index 8b137891791fe..0bd2286cced19 100755 --- a/docs/js/main.js +++ b/docs/js/main.js @@ -1 +1,80 @@ +function codeTabs() { + var counter = 0; + var langImages = { + "scala": "img/scala-sm.png", + "python": "img/python-sm.png", + "java": "img/java-sm.png" + }; + $("div.codetabs").each(function() { + $(this).addClass("tab-content"); + // Insert the tab bar + var tabBar = $(''); + $(this).before(tabBar); + + // Add each code sample to the tab bar: + var codeSamples = $(this).children("div"); + codeSamples.each(function() { + $(this).addClass("tab-pane"); + var lang = $(this).data("lang"); + var image = $(this).data("image"); + var notabs = $(this).data("notabs"); + var capitalizedLang = lang.substr(0, 1).toUpperCase() + lang.substr(1); + var id = "tab_" + lang + "_" + counter; + $(this).attr("id", id); + if (image != null && langImages[lang]) { + var buttonLabel = "" + capitalizedLang + ""; + } else if (notabs == null) { + var buttonLabel = "" + capitalizedLang + ""; + } else { + var buttonLabel = "" + } + tabBar.append( + '
  • ' + buttonLabel + '
  • ' + ); + }); + + codeSamples.first().addClass("active"); + tabBar.children("li").first().addClass("active"); + counter++; + }); + $("ul.nav-tabs a").click(function (e) { + // Toggling a tab should switch all tabs corresponding to the same language + // while retaining the scroll position + e.preventDefault(); + var scrollOffset = $(this).offset().top - $(document).scrollTop(); + $("." + $(this).attr('class')).tab('show'); + $(document).scrollTop($(this).offset().top - scrollOffset); + }); +} + +function makeCollapsable(elt, accordionClass, accordionBodyId, title) { + $(elt).addClass("accordion-inner"); + $(elt).wrap('
    ') + $(elt).wrap('
    ') + $(elt).wrap('
    ') + $(elt).parent().before( + '
    ' + + '' + + title + + '' + + '
    ' + ); +} + +function viewSolution() { + var counter = 0 + $("div.solution").each(function() { + var id = "solution_" + counter + makeCollapsable(this, "", id, + '' + + '' + "View Solution"); + counter++; + }); +} + + +$(function() { + codeTabs(); + viewSolution(); +}); diff --git a/docs/mllib-guide.md b/docs/mllib-guide.md index 1a5c640d10df4..a22a22184b5c6 100644 --- a/docs/mllib-guide.md +++ b/docs/mllib-guide.md @@ -21,7 +21,8 @@ depends on native Fortran routines. You may need to install the if it is not already present on your nodes. MLlib will throw a linking error if it cannot detect these libraries automatically. -To use MLlib in Python, you will also need [NumPy](http://www.numpy.org) version 1.7 or newer. +To use MLlib in Python, you will need [NumPy](http://www.numpy.org) version 1.7 or newer +and Python 2.7. # Binary Classification diff --git a/docs/monitoring.md b/docs/monitoring.md index 0d5eb7065e9f0..242318550c127 100644 --- a/docs/monitoring.md +++ b/docs/monitoring.md @@ -48,11 +48,22 @@ Each instance can report to zero or more _sinks_. Sinks are contained in the * `ConsoleSink`: Logs metrics information to the console. * `CSVSink`: Exports metrics data to CSV files at regular intervals. -* `GangliaSink`: Sends metrics to a Ganglia node or multicast group. * `JmxSink`: Registers metrics for viewing in a JXM console. * `MetricsServlet`: Adds a servlet within the existing Spark UI to serve metrics data as JSON data. * `GraphiteSink`: Sends metrics to a Graphite node. +Spark also supports a Ganglia sink which is not included in the default build due to +licensing restrictions: + +* `GangliaSink`: Sends metrics to a Ganglia node or multicast group. + +To install the `GangliaSink` you'll need to perform a custom build of Spark. _**Note that +by embedding this library you will include [LGPL](http://www.gnu.org/copyleft/lesser.html)-licensed +code in your Spark package**_. For sbt users, set the +`SPARK_GANGLIA_LGPL` environment variable before building. For Maven users, enable +the `-Pspark-ganglia-lgpl` profile. In addition to modifying the cluster's Spark build +user applications will need to link to the `spark-ganglia-lgpl` artifact. + The syntax of the metrics configuration file is defined in an example configuration file, `$SPARK_HOME/conf/metrics.conf.template`. diff --git a/docs/python-programming-guide.md b/docs/python-programming-guide.md index b07899c2e176d..7c5283fb0b6fb 100644 --- a/docs/python-programming-guide.md +++ b/docs/python-programming-guide.md @@ -52,7 +52,7 @@ In addition, PySpark fully supports interactive use---simply run `./bin/pyspark` # Installing and Configuring PySpark -PySpark requires Python 2.7 or higher. +PySpark requires Python 2.6 or higher. PySpark applications are executed using a standard CPython interpreter in order to support Python modules that use C extensions. We have not tested PySpark with Python 3 or with alternative Python interpreters, such as [PyPy](http://pypy.org/) or [Jython](http://www.jython.org/). @@ -152,7 +152,7 @@ Many of the methods also contain [doctests](http://docs.python.org/2/library/doc # Libraries [MLlib](mllib-guide.html) is also available in PySpark. To use it, you'll need -[NumPy](http://www.numpy.org) version 1.7 or newer. The [MLlib guide](mllib-guide.html) contains +[NumPy](http://www.numpy.org) version 1.7 or newer, and Python 2.7. The [MLlib guide](mllib-guide.html) contains some example applications. 
# Where to Go from Here diff --git a/docs/quick-start.md b/docs/quick-start.md index 153081bdaa286..13df6beea16e8 100644 --- a/docs/quick-start.md +++ b/docs/quick-start.md @@ -115,7 +115,7 @@ object SimpleApp { def main(args: Array[String]) { val logFile = "$YOUR_SPARK_HOME/README.md" // Should be some file on your system val sc = new SparkContext("local", "Simple App", "YOUR_SPARK_HOME", - List("target/scala-{{site.SCALA_VERSION}}/simple-project_{{site.SCALA_VERSION}}-1.0.jar")) + List("target/scala-{{site.SCALA_BINARY_VERSION}}/simple-project_{{site.SCALA_BINARY_VERSION}}-1.0.jar")) val logData = sc.textFile(logFile, 2).cache() val numAs = logData.filter(line => line.contains("a")).count() val numBs = logData.filter(line => line.contains("b")).count() @@ -214,7 +214,7 @@ To build the program, we also write a Maven `pom.xml` file that lists Spark as a org.apache.spark - spark-core_{{site.SCALA_VERSION}} + spark-core_{{site.SCALA_BINARY_VERSION}} {{site.SPARK_VERSION}} diff --git a/docs/running-on-yarn.md b/docs/running-on-yarn.md index 3bd62646bab06..cd4509ede735a 100644 --- a/docs/running-on-yarn.md +++ b/docs/running-on-yarn.md @@ -15,7 +15,7 @@ This can be built by setting the Hadoop version and `SPARK_YARN` environment var SPARK_HADOOP_VERSION=2.0.5-alpha SPARK_YARN=true sbt/sbt assembly The assembled JAR will be something like this: -`./assembly/target/scala-{{site.SCALA_VERSION}}/spark-assembly_{{site.SPARK_VERSION}}-hadoop2.0.5.jar`. +`./assembly/target/scala-{{site.SCALA_BINARY_VERSION}}/spark-assembly_{{site.SPARK_VERSION}}-hadoop2.0.5.jar`. The build process now also supports new YARN versions (2.2.x). See below. @@ -25,7 +25,7 @@ The build process now also supports new YARN versions (2.2.x). See below. - The assembled jar can be installed into HDFS or used locally. - Your application code must be packaged into a separate JAR file. -If you want to test out the YARN deployment mode, you can use the current Spark examples. A `spark-examples_{{site.SCALA_VERSION}}-{{site.SPARK_VERSION}}` file can be generated by running `sbt/sbt assembly`. NOTE: since the documentation you're reading is for Spark version {{site.SPARK_VERSION}}, we are assuming here that you have downloaded Spark {{site.SPARK_VERSION}} or checked it out of source control. If you are using a different version of Spark, the version numbers in the jar generated by the sbt package command will obviously be different. +If you want to test out the YARN deployment mode, you can use the current Spark examples. A `spark-examples_{{site.SCALA_BINARY_VERSION}}-{{site.SPARK_VERSION}}` file can be generated by running `sbt/sbt assembly`. NOTE: since the documentation you're reading is for Spark version {{site.SPARK_VERSION}}, we are assuming here that you have downloaded Spark {{site.SPARK_VERSION}} or checked it out of source control. If you are using a different version of Spark, the version numbers in the jar generated by the sbt package command will obviously be different. 
# Configuration @@ -78,9 +78,9 @@ For example: $ cp conf/log4j.properties.template conf/log4j.properties # Submit Spark's ApplicationMaster to YARN's ResourceManager, and instruct Spark to run the SparkPi example - $ SPARK_JAR=./assembly/target/scala-{{site.SCALA_VERSION}}/spark-assembly-{{site.SPARK_VERSION}}-hadoop2.0.5-alpha.jar \ + $ SPARK_JAR=./assembly/target/scala-{{site.SCALA_BINARY_VERSION}}/spark-assembly-{{site.SPARK_VERSION}}-hadoop2.0.5-alpha.jar \ ./bin/spark-class org.apache.spark.deploy.yarn.Client \ - --jar examples/target/scala-{{site.SCALA_VERSION}}/spark-examples-assembly-{{site.SPARK_VERSION}}.jar \ + --jar examples/target/scala-{{site.SCALA_BINARY_VERSION}}/spark-examples-assembly-{{site.SPARK_VERSION}}.jar \ --class org.apache.spark.examples.SparkPi \ --args yarn-standalone \ --num-workers 3 \ @@ -117,13 +117,13 @@ In order to tune worker core/number/memory etc. You need to export environment v For example: - SPARK_JAR=./assembly/target/scala-{{site.SCALA_VERSION}}/spark-assembly-{{site.SPARK_VERSION}}-hadoop2.0.5-alpha.jar \ - SPARK_YARN_APP_JAR=examples/target/scala-{{site.SCALA_VERSION}}/spark-examples-assembly-{{site.SPARK_VERSION}}.jar \ + SPARK_JAR=./assembly/target/scala-{{site.SCALA_BINARY_VERSION}}/spark-assembly-{{site.SPARK_VERSION}}-hadoop2.0.5-alpha.jar \ + SPARK_YARN_APP_JAR=examples/target/scala-{{site.SCALA_BINARY_VERSION}}/spark-examples-assembly-{{site.SPARK_VERSION}}.jar \ ./bin/run-example org.apache.spark.examples.SparkPi yarn-client - SPARK_JAR=./assembly/target/scala-{{site.SCALA_VERSION}}/spark-assembly-{{site.SPARK_VERSION}}-hadoop2.0.5-alpha.jar \ - SPARK_YARN_APP_JAR=examples/target/scala-{{site.SCALA_VERSION}}/spark-examples-assembly-{{site.SPARK_VERSION}}.jar \ + SPARK_JAR=./assembly/target/scala-{{site.SCALA_BINARY_VERSION}}/spark-assembly-{{site.SPARK_VERSION}}-hadoop2.0.5-alpha.jar \ + SPARK_YARN_APP_JAR=examples/target/scala-{{site.SCALA_BINARY_VERSION}}/spark-examples-assembly-{{site.SPARK_VERSION}}.jar \ MASTER=yarn-client ./bin/spark-shell @@ -133,7 +133,7 @@ See [Building Spark with Maven](building-with-maven.html) for instructions on ho # Important Notes -- We do not requesting container resources based on the number of cores. Thus the numbers of cores given via command line arguments cannot be guaranteed. +- Before Hadoop 2.2, YARN does not support cores in container resource requests. Thus, when running against an earlier version, the numbers of cores given via command line arguments cannot be passed to YARN. Whether core requests are honored in scheduling decisions depends on which scheduler is in use and how it is configured. - The local directories used for spark will be the local directories configured for YARN (Hadoop Yarn config yarn.nodemanager.local-dirs). If the user specifies spark.local.dir, it will be ignored. - The --files and --archives options support specifying file names with the # similar to Hadoop. For example you can specify: --files localtest.txt#appSees.txt and this will upload the file you have locally named localtest.txt into HDFS but this will be linked to by the name appSees.txt and your application should use the name as appSees.txt to reference it when running on YARN. - The --addJars option allows the SparkContext.addJar function to work if you are using it with local files. It does not need to be used if you are using it with HDFS, HTTP, HTTPS, or FTP files. 
diff --git a/docs/scala-programming-guide.md b/docs/scala-programming-guide.md index c1ef46a1cded7..99412733d4268 100644 --- a/docs/scala-programming-guide.md +++ b/docs/scala-programming-guide.md @@ -17,12 +17,12 @@ This guide shows each of these features and walks through some samples. It assum # Linking with Spark -Spark {{site.SPARK_VERSION}} uses Scala {{site.SCALA_VERSION}}. If you write applications in Scala, you'll need to use this same version of Scala in your program -- newer major versions may not work. +Spark {{site.SPARK_VERSION}} uses Scala {{site.SCALA_BINARY_VERSION}}. If you write applications in Scala, you will need to use a compatible Scala version (e.g. {{site.SCALA_BINARY_VERSION}}.X) -- newer major versions may not work. To write a Spark application, you need to add a dependency on Spark. If you use SBT or Maven, Spark is available through Maven Central at: groupId = org.apache.spark - artifactId = spark-core_{{site.SCALA_VERSION}} + artifactId = spark-core_{{site.SCALA_BINARY_VERSION}} version = {{site.SPARK_VERSION}} In addition, if you wish to access an HDFS cluster, you need to add a dependency on `hadoop-client` for your version of HDFS: @@ -31,7 +31,7 @@ In addition, if you wish to access an HDFS cluster, you need to add a dependency artifactId = hadoop-client version = -For other build systems, you can run `sbt/sbt assembly` to pack Spark and its dependencies into one JAR (`assembly/target/scala-{{site.SCALA_VERSION}}/spark-assembly-{{site.SPARK_VERSION}}-hadoop*.jar`), then add this to your CLASSPATH. Set the HDFS version as described [here](index.html#a-note-about-hadoop-versions). +For other build systems, you can run `sbt/sbt assembly` to pack Spark and its dependencies into one JAR (`assembly/target/scala-{{site.SCALA_BINARY_VERSION}}/spark-assembly-{{site.SPARK_VERSION}}-hadoop*.jar`), then add this to your CLASSPATH. Set the HDFS version as described [here](index.html#a-note-about-hadoop-versions). Finally, you need to import some Spark classes and implicit conversions into your program. Add the following lines: @@ -168,9 +168,9 @@ The following tables list the transformations and actions currently supported (s Iterator[T] => Iterator[U] when running on an RDD of type T. - mapPartitionsWithSplit(func) + mapPartitionsWithIndex(func) Similar to mapPartitions, but also provides func with an integer value representing the index of - the split, so func must be of type (Int, Iterator[T]) => Iterator[U] when running on an RDD of type T. + the partition, so func must be of type (Int, Iterator[T]) => Iterator[U] when running on an RDD of type T. @@ -344,7 +344,7 @@ After the broadcast variable is created, it should be used instead of the value ## Accumulators -Accumulators are variables that are only "added" to through an associative operation and can therefore be efficiently supported in parallel. They can be used to implement counters (as in MapReduce) or sums. Spark natively supports accumulators of type Int and Double, and programmers can add support for new types. +Accumulators are variables that are only "added" to through an associative operation and can therefore be efficiently supported in parallel. They can be used to implement counters (as in MapReduce) or sums. Spark natively supports accumulators of numeric value types and standard mutable collections, and programmers can add support for new types. An accumulator is created from an initial value `v` by calling `SparkContext.accumulator(v)`. 
Tasks running on the cluster can then add to it using the `+=` operator. However, they cannot read its value. Only the driver program can read the accumulator's value, using its `value` method. @@ -365,7 +365,7 @@ res2: Int = 10 # Where to Go from Here -You can see some [example Spark programs](http://spark.incubator.apache.org/examples.html) on the Spark website. +You can see some [example Spark programs](http://spark.apache.org/examples.html) on the Spark website. In addition, Spark includes several samples in `examples/src/main/scala`. Some of them have both Spark versions and local (non-parallel) versions, allowing you to see what had to be changed to make the program run on a cluster. You can run them using by passing the class name to the `bin/run-example` script included in Spark; for example: ./bin/run-example org.apache.spark.examples.SparkPi diff --git a/docs/spark-debugger.md b/docs/spark-debugger.md index 11c51d5cde7c9..891c2bfa8943d 100644 --- a/docs/spark-debugger.md +++ b/docs/spark-debugger.md @@ -2,7 +2,7 @@ layout: global title: The Spark Debugger --- -**Summary:** The Spark debugger provides replay debugging for deterministic (logic) errors in Spark programs. It's currently in development, but you can try it out in the [arthur branch](https://github.com/apache/incubator-spark/tree/arthur). +**Summary:** The Spark debugger provides replay debugging for deterministic (logic) errors in Spark programs. It's currently in development, but you can try it out in the [arthur branch](https://github.com/apache/spark/tree/arthur). ## Introduction @@ -19,7 +19,7 @@ For deterministic errors, debugging a Spark program is now as easy as debugging ## Approach -As your Spark program runs, the slaves report key events back to the master -- for example, RDD creations, RDD contents, and uncaught exceptions. (A full list of event types is in [EventLogging.scala](https://github.com/apache/incubator-spark/blob/arthur/core/src/main/scala/spark/EventLogging.scala).) The master logs those events, and you can load the event log into the debugger after your program is done running. +As your Spark program runs, the slaves report key events back to the master -- for example, RDD creations, RDD contents, and uncaught exceptions. (A full list of event types is in [EventLogging.scala](https://github.com/apache/spark/blob/arthur/core/src/main/scala/spark/EventLogging.scala).) The master logs those events, and you can load the event log into the debugger after your program is done running. _A note on nondeterminism:_ For fault recovery, Spark requires RDD transformations (for example, the function passed to `RDD.map`) to be deterministic. The Spark debugger also relies on this property, and it can also warn you if your transformation is nondeterministic. This works by checksumming the contents of each RDD and comparing the checksums from the original execution to the checksums after recomputing the RDD in the debugger. diff --git a/docs/spark-standalone.md b/docs/spark-standalone.md index 2a186261b754a..3388c14ec4d48 100644 --- a/docs/spark-standalone.md +++ b/docs/spark-standalone.md @@ -151,7 +151,7 @@ You can also pass an option `-c ` to control the number of cores that You may also run your application entirely inside of the cluster by submitting your application driver using the submission client. 
The syntax for submitting applications is as follows: - ./spark-class org.apache.spark.deploy.Client launch + ./bin/spark-class org.apache.spark.deploy.Client launch [client-options] \ \ [application-options] @@ -176,7 +176,7 @@ Once you submit a driver program, it will appear in the cluster management UI at be assigned an identifier. If you'd like to prematurely terminate the program, you can do so using the same client: - ./spark-class org.apache.spark.deploy.client.DriverClient kill + ./bin/spark-class org.apache.spark.deploy.Client kill # Resource Scheduling diff --git a/docs/streaming-programming-guide.md b/docs/streaming-programming-guide.md index 07c4c55633929..4985c52a11ada 100644 --- a/docs/streaming-programming-guide.md +++ b/docs/streaming-programming-guide.md @@ -7,74 +7,457 @@ title: Spark Streaming Programming Guide {:toc} # Overview -A Spark Streaming application is very similar to a Spark application; it consists of a *driver program* that runs the user's `main` function and continuous executes various *parallel operations* on input streams of data. The main abstraction Spark Streaming provides is a *discretized stream* (DStream), which is a continuous sequence of RDDs (distributed collections of elements) representing a continuous stream of data. DStreams can be created from live incoming data (such as data from a socket, Kafka, etc.) or can be generated by transforming existing DStreams using parallel operators like `map`, `reduce`, and `window`. The basic processing model is as follows: -(i) While a Spark Streaming driver program is running, the system receives data from various sources and and divides it into batches. Each batch of data is treated as an RDD, that is, an immutable parallel collection of data. These input RDDs are saved in memory and replicated to two nodes for fault-tolerance. This sequence of RDDs is collectively called an InputDStream. -(ii) Data received by InputDStreams are processed using DStream operations. Since all data is represented as RDDs and all DStream operations as RDD operations, data is automatically recovered in the event of node failures. +Spark Streaming is an extension of the core Spark API that allows enables high-throughput, +fault-tolerant stream processing of live data streams. Data can be ingested from many sources +like Kafka, Flume, Twitter, ZeroMQ or plain old TCP sockets and be processed using complex +algorithms expressed with high-level functions like `map`, `reduce`, `join` and `window`. +Finally, processed data can be pushed out to filesystems, databases, +and live dashboards. In fact, you can apply Spark's in-built +[machine learning](mllib-guide.html) algorithms, and +[graph processing](graphx-programming-guide.html) algorithms on data streams. + +

[figure: Spark Streaming]

    + +Internally, it works as follows. Spark Streaming receives live input data streams and divides +the data into batches, which are then processed by the Spark engine to generate the final +stream of results in batches. + +

[figure: Spark Streaming]

    + +Spark Streaming provides a high-level abstraction called *discretized stream* or *DStream*, +which represents a continuous stream of data. DStreams can be created either from input data +stream from sources such as Kafka and Flume, or by applying high-level +operations on other DStreams. Internally, a DStream is represented as a sequence of +[RDDs](api/core/index.html#org.apache.spark.rdd.RDD). + +This guide shows you how to start writing Spark Streaming programs with DStreams. You can +write Spark Streaming programs in Scala or Java, both of which are presented in this guide. You +will find tabs throughout this guide that let you choose between Scala and Java +code snippets. + +*************************************************************************************************** + +# A Quick Example +Before we go into the details of how to write your own Spark Streaming program, +let's take a quick look at what a simple Spark Streaming program looks like. Let's say we want to +count the number of words in text data received from a data server listening on a TCP +socket. All you need to +do is as follows. + +
    +
    + +First, we create a +[StreamingContext](api/streaming/index.html#org.apache.spark.streaming.StreamingContext) object, +which is the main entry point for all streaming +functionality. Besides Spark's configuration, we specify that any DStream will be processed +in 1 second batches. -This guide shows some how to start programming with DStreams. +{% highlight scala %} +// Create a StreamingContext with a SparkConf configuration +val ssc = new StreamingContext(sparkConf, Seconds(1)) +{% endhighlight %} -# Linking with Spark Streaming +Using this context, we then create a new DStream +by specifying the IP address and port of the data server. -Add the following SBT or Maven dependency to your project to use Spark Streaming: +{% highlight scala %} +// Create a DStream that will connect to serverIP:serverPort +val lines = ssc.socketTextStream(serverIP, serverPort) +{% endhighlight %} - groupId = org.apache.spark - artifactId = spark-streaming_{{site.SCALA_VERSION}} - version = {{site.SPARK_VERSION}} +This `lines` DStream represents the stream of data that will be received from the data +server. Each record in this DStream is a line of text. Next, we want to split the lines by +space into words. -For ingesting data from sources like Kafka and Flume, add the corresponding artifact `spark-streaming-xyz_{{site.SCALA_VERSION}}` to the dependencies. For example, `spark-streaming-kafka_{{site.SCALA_VERSION}}` for Kafka, `spark-streaming-flume_{{site.SCALA_VERSION}}`, etc. Please refer to the [Apache repository](http://search.maven.org/#search%7Cga%7C1%7Cg%3A%22org.apache.spark%22%20AND%20v%3A%22{{site.SPARK_VERSION}}%22) for the full list of supported sources / artifacts. +{% highlight scala %} +// Split each line into words +val words = lines.flatMap(_.split(" ")) +{% endhighlight %} -# Initializing Spark Streaming -The first thing a Spark Streaming program must do is create a `StreamingContext` object, which tells Spark how to access a cluster. A `StreamingContext` can be created by using +`flatMap` is a one-to-many DStream operation that creates a new DStream by +generating multiple new records from each record int the source DStream. In this case, +each line will be split into multiple words and the stream of words is represented as the +`words` DStream. Next, we want to count these words. {% highlight scala %} -new StreamingContext(master, appName, batchDuration, [sparkHome], [jars]) +// Count each word in each batch +val pairs = words.map(word => (word, 1)) +val wordCounts = pairs.reduceByKey(_ + _) + +// Print a few of the counts to the console +wordCount.print() {% endhighlight %} -The `master` parameter is a standard [Spark cluster URL](scala-programming-guide.html#master-urls) and can be "local" for local testing. The `appName` is a name of your program, which will be shown on your cluster's web UI. The `batchDuration` is the size of the batches (as explained earlier). This must be set carefully such that the cluster can keep up with the processing of the data streams. Start with something conservative like 5 seconds. See the [Performance Tuning](#setting-the-right-batch-size) section for a detailed discussion. Finally, `sparkHome` and `jars` are optional parameters, which need to be set when running on a cluster to specify the location of your code, as described in the [Spark programming guide](scala-programming-guide.html#deploying-code-on-a-cluster). 
+The `words` DStream is further mapped (one-to-one transformation) to a DStream of `(word, +1)` pairs, which is then reduced to get the frequency of words in each batch of data. +Finally, `wordCounts.print()` will print a few of the counts generated every second. + +Note that when these lines are executed, Spark Streaming only sets up the computation it +will perform when it is started, and no real processing has started yet. To start the processing +after all the transformations have been setup, we finally call {% highlight scala %} -new SparkConf(conf, batchDuration) +ssc.start() // Start the computation +ssc.awaitTermination() // Wait for the computation to terminate +{% endhighlight %} + +The complete code can be found in the Spark Streaming example +[NetworkWordCount]({{site.SPARK_GITHUB_URL}}/blob/master/examples/src/main/scala/org/apache/spark/streaming/examples/NetworkWordCount.scala). +
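For reference, the Scala snippets above can be stitched into one self-contained program. The sketch below is illustrative (the object name and the hard-coded host and port are assumptions), not the exact contents of the linked example:

{% highlight scala %}
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.StreamingContext._

object QuickExampleWordCount {
  def main(args: Array[String]) {
    // "local[2]" leaves one thread for the receiver and one for processing
    val sparkConf = new SparkConf().setMaster("local[2]").setAppName("QuickExampleWordCount")
    val ssc = new StreamingContext(sparkConf, Seconds(1))

    // Count words in text received from a socket and print a few counts every second
    val lines = ssc.socketTextStream("localhost", 9999)
    val words = lines.flatMap(_.split(" "))
    val wordCounts = words.map(word => (word, 1)).reduceByKey(_ + _)
    wordCounts.print()

    ssc.start()             // Start the computation
    ssc.awaitTermination()  // Wait for it to terminate
  }
}
{% endhighlight %}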
    + +
    +
    + +First, we create a +[JavaStreamingContext](api/streaming/index.html#org.apache.spark.streaming.api.java.JavaStreamingContext) object, +which is the main entry point for all streaming +functionality. Besides Spark's configuration, we specify that any DStream would be processed +in 1 second batches. + +{% highlight java %} +// Create a StreamingContext with a SparkConf configuration +JavaStreamingContext jssc = StreamingContext(sparkConf, new Duration(1000)) +{% endhighlight %} + +Using this context, we then create a new DStream +by specifying the IP address and port of the data server. + +{% highlight java %} +// Create a DStream that will connect to serverIP:serverPort +JavaDStream lines = jssc.socketTextStream(serverIP, serverPort); +{% endhighlight %} + +This `lines` DStream represents the stream of data that will be received from the data +server. Each record in this stream is a line of text. Then, we want to split the the lines by +space into words. + +{% highlight java %} +// Split each line into words +JavaDStream words = lines.flatMap( + new FlatMapFunction() { + @Override public Iterable call(String x) { + return Lists.newArrayList(x.split(" ")); + } + }); +{% endhighlight %} + +`flatMap` is a DStream operation that creates a new DStream by +generating multiple new records from each record in the source DStream. In this case, +each line will be split into multiple words and the stream of words is represented as the +`words` DStream. Note that we defined the transformation using a +[FlatMapFunction](api/core/index.html#org.apache.spark.api.java.function.FlatMapFunction) object. +As we will discover along the way, there are a number of such convenience classes in the Java API +that help define DStream transformations. + +Next, we want to count these words. + +{% highlight java %} +// Count each word in each batch +JavaPairDStream pairs = words.map( + new PairFunction() { + @Override public Tuple2 call(String s) throws Exception { + return new Tuple2(s, 1); + } + }); +JavaPairDStream wordCounts = pairs.reduceByKey( + new Function2() { + @Override public Integer call(Integer i1, Integer i2) throws Exception { + return i1 + i2; + } + }); +wordCount.print(); // Print a few of the counts to the console {% endhighlight %} -where `conf` is a [SparkConf](api/core/index.html#org.apache.spark.SparkConf) -object used for more advanced configuration. In both cases, a [SparkContext](api/core/index.html#org.apache.spark.SparkContext) is created as well which can be accessed with `streamingContext.sparkContext`. +The `words` DStream is further mapped (one-to-one transformation) to a DStream of `(word, +1)` pairs, using a [PairFunction](api/core/index.html#org.apache.spark.api.java.function.PairFunction) +object. Then, it is reduced to get the frequency of words in each batch of data, +using a [Function2](api/core/index.html#org.apache.spark.api.java.function.Function2) object. +Finally, `wordCounts.print()` will print a few of the counts generated every second. + +Note that when these lines are executed, Spark Streaming only sets up the computation it +will perform when it is started, and no real processing has started yet. 
To start the processing +after all the transformations have been setup, we finally call -# Attaching Input Sources -The StreamingContext is used to creating input streams from data sources: +{% highlight java %} +jssc.start(); // Start the computation +jssc.awaitTermination(); // Wait for the computation to terminate +{% endhighlight %} + +The complete code can be found in the Spark Streaming example +[JavaNetworkWordCount]({{site.SPARK_GITHUB_URL}}/blob/master/examples/src/main/java/org/apache/spark/streaming/examples/JavaNetworkWordCount.java). +
    + +
    +
    + +If you have already [downloaded](index.html#downloading) and [built](index.html#building) Spark, +you can run this example as follows. You will first need to run Netcat +(a small utility found in most Unix-like systems) as a data server by using + +{% highlight bash %} +$ nc -lk 9999 +{% endhighlight %} + +Then, in a different terminal, you can start the example by using + +
    +
    +{% highlight bash %} +$ ./bin/run-example org.apache.spark.streaming.examples.NetworkWordCount local[2] localhost 9999 +{% endhighlight %} +
    +
    +{% highlight bash %} +$ ./bin/run-example org.apache.spark.streaming.examples.JavaNetworkWordCount local[2] localhost 9999 +{% endhighlight %} +
    +
    + + +Then, any lines typed in the terminal running the netcat server will be counted and printed on +screen every second. It will look something like this. + + + + + +
    +{% highlight bash %} +# TERMINAL 1: +# Running Netcat + +$ nc -lk 9999 + +hello world + + + +... +{% endhighlight %} + +{% highlight bash %} +# TERMINAL 2: RUNNING NetworkWordCount or JavaNetworkWordCount + +$ ./bin/run-example org.apache.spark.streaming.examples.NetworkWordCount local[2] localhost 9999 +... +------------------------------------------- +Time: 1357008430000 ms +------------------------------------------- +(hello,1) +(world,1) +... +{% endhighlight %} +
    + +*************************************************************************************************** + +# Basics + +Next, we move beyond the simple example and elaborate on the basics of Spark Streaming that you +need to know to write your streaming applications. + +## Linking + +To write your own Spark Streaming program, you will have to add the following dependency to your + SBT or Maven project: + + groupId = org.apache.spark + artifactId = spark-streaming_{{site.SCALA_BINARY_VERSION}} + version = {{site.SPARK_VERSION}} + +For ingesting data from sources like Kafka and Flume that are not present in the Spark +Streaming core + API, you will have to add the corresponding +artifact `spark-streaming-xyz_{{site.SCALA_BINARY_VERSION}}` to the dependencies. For example, +some of the common ones are as follows. + + + + + + + + + + +
| Source | Artifact |
| ------ | -------- |
| Kafka | `spark-streaming-kafka_{{site.SCALA_BINARY_VERSION}}` |
| Flume | `spark-streaming-flume_{{site.SCALA_BINARY_VERSION}}` |
| Twitter | `spark-streaming-twitter_{{site.SCALA_BINARY_VERSION}}` |
| ZeroMQ | `spark-streaming-zeromq_{{site.SCALA_BINARY_VERSION}}` |
| MQTT | `spark-streaming-mqtt_{{site.SCALA_BINARY_VERSION}}` |
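In sbt, for example, these coordinates translate into library dependencies along the following lines (a sketch only; the Scala binary version 2.10 and Spark version 0.9.0-incubating shown here are assumptions and should be replaced with the versions you actually build against):

{% highlight scala %}
// build.sbt (illustrative versions)
libraryDependencies ++= Seq(
  "org.apache.spark" % "spark-streaming_2.10" % "0.9.0-incubating",
  // only needed when ingesting from Kafka
  "org.apache.spark" % "spark-streaming-kafka_2.10" % "0.9.0-incubating"
)
{% endhighlight %}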
    + +For an up-to-date list, please refer to the +[Apache repository](http://search.maven.org/#search%7Cga%7C1%7Cg%3A%22org.apache.spark%22%20AND%20v%3A%22{{site.SPARK_VERSION}}%22) +for the full list of supported sources and artifacts. + +## Initializing + +
    +
    + +To initialize a Spark Streaming program in Scala, a +[`StreamingContext`](api/streaming/index.html#org.apache.spark.streaming.StreamingContext) +object has to be created, which is the main entry point of all Spark Streaming functionality. +A `StreamingContext` object can be created by using {% highlight scala %} -// Assuming ssc is the StreamingContext -ssc.textFileStream(directory) // Creates a stream that monitors and processes new files in a HDFS directory -ssc.socketStream(hostname, port) // Creates a stream that uses a TCP socket to read data from hostname:port +new StreamingContext(master, appName, batchDuration, [sparkHome], [jars]) {% endhighlight %} +
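For example, a concrete invocation of this constructor might look like the following sketch (the master URL, application name, and batch duration are illustrative values):

{% highlight scala %}
import org.apache.spark.streaming.{Seconds, StreamingContext}

// Local master with two threads and a 5-second batch interval; the optional
// sparkHome and jars arguments are only needed when deploying to a cluster
val ssc = new StreamingContext("local[2]", "MyStreamingApp", Seconds(5))
{% endhighlight %}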
    +
    -The core Spark Streaming API provides input streams for files, sockets, and Akka actors. Additional functionality for Kafka, Flume, ZeroMQ, Twitter, etc. can be imported by adding the right dependencies as explained in the [linking](#linking-with-spark-streaming) section. +To initialize a Spark Streaming program in Java, a +[`JavaStreamingContext`](api/streaming/index.html#org.apache.spark.streaming.api.java.JavaStreamingContext) +object has to be created, which is the main entry point of all Spark Streaming functionality. +A `JavaStreamingContext` object can be created by using -# DStream Operations -Data received from the input streams can be processed using _DStream operations_. There are two kinds of operations - _transformations_ and _output operations_. Similar to RDD transformations, DStream transformations operate on one or more DStreams to create new DStreams with transformed data. After applying a sequence of transformations to the input streams, output operations need to called, which write data out to an external data sink like a file system or a database. +{% highlight scala %} +new JavaStreamingContext(master, appName, batchInterval, [sparkHome], [jars]) +{% endhighlight %} +
    +
    + +The `master` parameter is a standard [Spark cluster URL](scala-programming-guide.html#master-urls) +and can be "local" for local testing. The `appName` is a name of your program, +which will be shown on your cluster's web UI. The `batchInterval` is the size of the batches, +as explained earlier. Finally, the last two parameters are needed to deploy your code to a cluster + if running in distributed mode, as described in the + [Spark programming guide](scala-programming-guide.html#deploying-code-on-a-cluster). + Additionally, the underlying SparkContext can be accessed as +`streamingContext.sparkContext`. + +The batch interval must be set based on the latency requirements of your application +and available cluster resources. See the [Performance Tuning](#setting-the-right-batch-size) +section for more details. + +## DStreams +*Discretized Stream* or *DStream* is the basic abstraction provided by Spark Streaming. +It represents a continuous stream of data, either the input data stream received from source, +or the processed data stream generated by transforming the input stream. Internally, +it is represented by a continuous sequence of RDDs, which is Spark's abstraction of an immutable, +distributed dataset. Each RDD in a DStream contains data from a certain interval, +as shown in the following figure. + +

[figure: Spark Streaming]

Any operation applied on a DStream translates to operations on the underlying RDDs. For example, in the [earlier example](#a-quick-example) of converting a stream of lines to words, the `flatMap` operation is applied on each RDD in the `lines` DStream to generate the RDDs of the `words` DStream. This is shown in the following figure.

[figure: Spark Streaming]

These underlying RDD transformations are computed by the Spark engine. The DStream operations hide most of these details and provide the developer with a higher-level API for convenience. These operations are discussed in detail in later sections.

## Input Sources

We have already taken a look at `streamingContext.socketTextStream(...)` in the [quick example](#a-quick-example), which creates a DStream from text data received over a TCP socket connection. Besides sockets, the core Spark Streaming API provides methods for creating DStreams from files and Akka actors as input sources.

Specifically, for files, the DStream can be created as
    +
    +{% highlight scala %} +streamingContext.fileStream(dataDirectory) +{% endhighlight %} +
    +
    +{% highlight java %} +javaStreamingContext.fileStream(dataDirectory); +{% endhighlight %} +
    +
Spark Streaming will monitor the directory `dataDirectory` on any Hadoop-compatible filesystem and process any files created in that directory. Note that

 * The files must have the same data format.
 * The files must be created in `dataDirectory` by atomically *moving* or *renaming* them into the data directory.
 * Once moved, the files must not be changed.

For more details on streams from files, Akka actors and sockets, see the API documentation of the relevant functions in [StreamingContext](api/streaming/index.html#org.apache.spark.streaming.StreamingContext) for Scala and [JavaStreamingContext](api/streaming/index.html#org.apache.spark.streaming.api.java.JavaStreamingContext) for Java.

Additional functionality for creating DStreams from sources such as Kafka, Flume, and Twitter can be imported by adding the right dependencies as explained in an [earlier](#linking) section. To take the case of Kafka, after adding the artifact `spark-streaming-kafka_{{site.SCALA_BINARY_VERSION}}` to the project dependencies, you can create a DStream from Kafka as
    +
    +{% highlight scala %} +import org.apache.spark.streaming.kafka._ +KafkaUtils.createStream(streamingContext, kafkaParams, ...) +{% endhighlight %} +
    +
    +{% highlight java %} +import org.apache.spark.streaming.kafka.* +KafkaUtils.createStream(javaStreamingContext, kafkaParams, ...); +{% endhighlight %} +
    +
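As a concrete sketch of the Scala variant (assuming `ssc` is the StreamingContext created earlier; the ZooKeeper quorum, consumer group, and topic map are placeholder values):

{% highlight scala %}
import org.apache.spark.streaming.kafka.KafkaUtils

// Connect to ZooKeeper at zk-host:2181, join consumer group "my-group",
// and consume topic "my-topic" with a single receiver thread
val kafkaStream = KafkaUtils.createStream(ssc, "zk-host:2181", "my-group", Map("my-topic" -> 1))
{% endhighlight %}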
    -## Transformations +For more details on these additional sources, see the corresponding [API documentation] +(#where-to-go-from-here). Furthermore, you can also implement your own custom receiver +for your sources. See the [Custom Receiver Guide](streaming-custom-receivers.html). -DStreams support many of the transformations available on normal Spark RDD's: +## Operations +There are two kinds of DStream operations - _transformations_ and _output operations_. Similar to +RDD transformations, DStream transformations operate on one or more DStreams to create new DStreams +with transformed data. After applying a sequence of transformations to the input streams, output +operations need to called, which write data out to an external data sink, such as a filesystem or a +database. + +### Transformations +DStreams support many of the transformations available on normal Spark RDD's. Some of the +common ones are as follows. - + - - - - - + - + - - + + @@ -82,329 +465,681 @@ DStreams support many of the transformations available on normal Spark RDD's: - + - + - + - - - - - + - + - + - + - + - - - -
    TransformationMeaning
    TransformationMeaning
    map(func) Returns a new DStream formed by passing each element of the source DStream through a function func.
    filter(func) Returns a new DStream formed by selecting those elements of the source DStream on which func returns true. Return a new DStream by passing each element of the source DStream through a + function func.
    flatMap(func) Similar to map, but each input item can be mapped to 0 or more output items (so func should return a Seq rather than a single item). Similar to map, but each input item can be mapped to 0 or more output items.
    mapPartitions(func) Similar to map, but runs separately on each partition (block) of the DStream, so func must be of type - Iterator[T] => Iterator[U] when running on an DStream of type T. filter(func) Return a new DStream by selecting only the records of the source DStream on which + func returns true.
    repartition(numPartitions)
    union(otherStream) Return a new DStream that contains the union of the elements in the source DStream and the argument DStream. Return a new DStream that contains the union of the elements in the source DStream and + otherDStream.
    count() Returns a new DStream of single-element RDDs by counting the number of elements in each RDD of the source DStream. Return a new DStream of single-element RDDs by counting the number of elements in each RDD + of the source DStream.
    reduce(func) Returns a new DStream of single-element RDDs by aggregating the elements in each RDD of the source DStream using a function func (which takes two arguments and returns one). The function should be associative so that it can be computed in parallel. Return a new DStream of single-element RDDs by aggregating the elements in each RDD of the + source DStream using a function func (which takes two arguments and returns one). + The function should be associative so that it can be computed in parallel.
    countByValue() When called on a DStream of elements of type K, returns a new DStream of (K, Long) pairs where the value of each key is its frequency in each RDD of the source DStream.
    groupByKey([numTasks]) When called on a DStream of (K, V) pairs, returns a new DStream of (K, Seq[V]) pairs by grouping together all the values of each key in the RDDs of the source DStream.
    - Note: By default, this uses Spark's default number of parallel tasks (2 for local machine, 8 for a cluster) to do the grouping. You can pass an optional numTasks argument to set a different number of tasks. -
    When called on a DStream of elements of type K, return a new DStream of (K, Long) pairs + where the value of each key is its frequency in each RDD of the source DStream.
    reduceByKey(func, [numTasks]) When called on a DStream of (K, V) pairs, returns a new DStream of (K, V) pairs where the values for each key are aggregated using the given reduce function. Like in groupByKey, the number of reduce tasks is configurable through an optional second argument. When called on a DStream of (K, V) pairs, return a new DStream of (K, V) pairs where the + values for each key are aggregated using the given reduce function. Note: By default, + this uses Spark's default number of parallel tasks (2 for local machine, 8 for a cluster) to + do the grouping. You can pass an optional numTasks argument to set a different + number of tasks.
    join(otherStream, [numTasks]) When called on two DStreams of (K, V) and (K, W) pairs, returns a new DStream of (K, (V, W)) pairs with all pairs of elements for each key. When called on two DStreams of (K, V) and (K, W) pairs, return a new DStream of (K, (V, W)) + pairs with all pairs of elements for each key.
    cogroup(otherStream, [numTasks]) When called on DStream of (K, V) and (K, W) pairs, returns a new DStream of (K, Seq[V], Seq[W]) tuples. When called on DStream of (K, V) and (K, W) pairs, return a new DStream of + (K, Seq[V], Seq[W]) tuples.
    transform(func) Returns a new DStream by applying func (a RDD-to-RDD function) to every RDD of the stream. This can be used to do arbitrary RDD operations on the DStream. Return a new DStream by applying a RDD-to-RDD function to every RDD of the source DStream. + This can be used to do arbitrary RDD operations on the DStream.
    updateStateByKey(func) Return a new "state" DStream where the state for each key is updated by applying the given function on the previous state of the key and the new values of each key. This can be used to track session state by using the session-id as the key and updating the session state as new data is received.
    - -Spark Streaming features windowed computations, which allow you to apply transformations over a sliding window of data. All window functions take a windowDuration, which represents the width of the window and a slideTime, which represents the frequency during which the window is calculated. - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +
    TransformationMeaning
    window(windowDuration, slideDuration) Return a new DStream which is computed based on windowed batches of the source DStream. windowDuration is the width of the window and slideTime is the frequency during which the window is calculated. Both times must be multiples of the batch interval. -
    countByWindow(windowDuration, slideDuration) Return a sliding count of elements in the stream. windowDuration and slideDuration are exactly as defined in window(). -
    reduceByWindow(func, windowDuration, slideDuration) Return a new single-element stream, created by aggregating elements in the stream over a sliding interval using func. The function should be associative so that it can be computed correctly in parallel. windowDuration and slideDuration are exactly as defined in window(). -
    groupByKeyAndWindow(windowDuration, slideDuration, [numTasks]) - When called on a DStream of (K, V) pairs, returns a new DStream of (K, Seq[V]) pairs by grouping together values of each key over batches in a sliding window.
    -Note: By default, this uses Spark's default number of parallel tasks (2 for local machine, 8 for a cluster) to do the grouping. You can pass an optional numTasks argument to set a different number of tasks.
    reduceByKeyAndWindow(func, windowDuration, slideDuration, [numTasks]) When called on a DStream of (K, V) pairs, returns a new DStream of (K, V) pairs where the values for each key are aggregated using the given reduce function func over batches in a sliding window. Like in groupByKeyAndWindow, the number of reduce tasks is configurable through an optional second argument. - windowDuration and slideDuration are exactly as defined in window(). -
    reduceByKeyAndWindow(func, invFunc, windowDuration, slideDuration, [numTasks]) A more efficient version of the above reduceByKeyAndWindow() where the reduce value of each window is calculated - incrementally using the reduce values of the previous window. This is done by reducing the new data that enter the sliding window, and "inverse reducing" the old data that leave the window. An example would be that of "adding" and "subtracting" counts of keys as the window slides. However, it is applicable to only "invertible reduce functions", that is, those reduce functions which have a corresponding "inverse reduce" function (taken as parameter invFunc. Like in groupByKeyAndWindow, the number of reduce tasks is configurable through an optional second argument. - windowDuration and slideDuration are exactly as defined in window(). -
countByValueAndWindow(windowDuration, slideDuration, [numTasks]) When called on a DStream of (K, V) pairs, returns a new DStream of (K, Long) pairs where the value of each key is its frequency within a sliding window. Like in groupByKeyAndWindow, the number of reduce tasks is configurable through an optional second argument. windowDuration and slideDuration are exactly as defined in window(). Return a new "state" DStream where the state for each key is updated by applying the given function on the previous state of the key and the new values for the key. This can be used to maintain arbitrary state data for each key.
    -A complete list of DStream operations is available in the API documentation of [DStream](api/streaming/index.html#org.apache.spark.streaming.dstream.DStream) and [PairDStreamFunctions](api/streaming/index.html#org.apache.spark.streaming.dstream.PairDStreamFunctions). +The last two transformations are worth highlighting again. -## Output Operations -When an output operator is called, it triggers the computation of a stream. Currently the following output operators are defined: +

#### UpdateStateByKey Operation

The `updateStateByKey` operation allows you to maintain arbitrary stateful computation, where you want to maintain some state data and continuously update it with new information. To use this, you will have to do two steps.

1. Define the state - The state can be of arbitrary data type.
2. Define the state update function - Specify with a function how to update the state using the previous state and the new values from the input stream.

Let's illustrate this with an example. Say you want to maintain a running count of each word seen in a text data stream. Here, the running count is the state and it is an integer. We define the update function as
    +
    -
    - - - +{% highlight scala %} +def updateFunction(newValues: Seq[Int], runningCount: Option[Int]): Option[Int] = { + val newCount = ... // add the new values with the previous running count to get the new count + Some(newCount) +} +{% endhighlight %} -
    OperatorMeaning
    foreachRDD(func) The fundamental output operator. Applies a function, func, to each RDD generated from the stream. This function should have side effects, such as printing output, saving the RDD to external files, or writing it over the network to an external system.
    print() Prints first ten elements of every batch of data in a DStream on the driver.
    saveAsObjectFiles(prefix, [suffix]) Save this DStream's contents as a SequenceFile of serialized objects. The file name at each batch interval is generated based on prefix and suffix: "prefix-TIME_IN_MS[.suffix]". -
    saveAsTextFiles(prefix, [suffix]) Save this DStream's contents as a text files. The file name at each batch interval is generated based on prefix and suffix: "prefix-TIME_IN_MS[.suffix]".
    saveAsHadoopFiles(prefix, [suffix]) Save this DStream's contents as a Hadoop file. The file name at each batch interval is generated based on prefix and suffix: "prefix-TIME_IN_MS[.suffix]".
    +This is applied on a DStream containing words (say, the `pairs` DStream containing `(word, +1)` pairs in the [earlier example](#a-quick-example)). -# Starting the Streaming computation -All the above DStream operations are completely lazy, that is, the operations will start executing only after the context is started by using {% highlight scala %} -ssc.start() +val runningCounts = pairs.updateStateByKey[Int](updateFunction _) {% endhighlight %} -Conversely, the computation can be stopped by using -{% highlight scala %} -ssc.stop() + +
    + +{% highlight java %} +Function2, Optional, Optional> updateFunction = + new Function2, Optional, Optional>() { + @Override public Optional call(List values, Optional state) { + Integer newSum = ... // add the new values with the previous running count to get the new count + return Optional.of(newSum) + } + } {% endhighlight %} -# Example -A simple example to start off is the [NetworkWordCount](https://github.com/apache/incubator-spark/tree/master/examples/src/main/scala/org/apache/spark/streaming/examples/NetworkWordCount.scala). This example counts the words received from a network server every second. Given below is the relevant sections of the source code. You can find the full source code in `/streaming/src/main/scala/org/apache/spark/streaming/examples/NetworkWordCount.scala` . +This is applied on a DStream containing words (say, the `pairs` DStream containing `(word, +1)` pairs in the [quick example](#a-quick-example)). -{% highlight scala %} -import org.apache.spark.streaming.{Seconds, StreamingContext} -import StreamingContext._ -... +{% highlight java %} +JavaPairDStream runningCounts = pairs.updateStateByKey(updateFunction); +{% endhighlight %} -// Create the context and set up a network input stream to receive from a host:port -val ssc = new StreamingContext(args(0), "NetworkWordCount", Seconds(1)) -val lines = ssc.socketTextStream(args(1), args(2).toInt) +
    + -// Split the lines into words, count them, and print some of the counts on the master -val words = lines.flatMap(_.split(" ")) -val wordCounts = words.map(x => (x, 1)).reduceByKey(_ + _) -wordCounts.print() +The update function will be called for each word, with `newValues` having a sequence of 1's (from +the `(word, 1)` pairs) and the `runningCount` having the previous count. For the complete +Scala code, take a look at the example +[StatefulNetworkWordCount]({{site.SPARK_GITHUB_URL}}/blob/master/examples/src/main/scala/org/apache/spark/streaming/examples/StatefulNetworkWordCount.scala). -// Start the computation -ssc.start() -{% endhighlight %} +
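Putting these pieces together, a minimal runnable sketch of the running word count might look as follows (names are illustrative and the elided addition is filled in; this is not the exact contents of the linked example). Note that stateful operations such as `updateStateByKey` require a checkpoint directory to be set:

{% highlight scala %}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.StreamingContext._

val ssc = new StreamingContext("local[2]", "RunningWordCount", Seconds(1))
ssc.checkpoint("checkpoint")   // required for stateful operations like updateStateByKey

// Add the new 1's from the current batch to the previous running count
val updateFunction = (newValues: Seq[Int], runningCount: Option[Int]) =>
  Some(newValues.sum + runningCount.getOrElse(0))

val lines = ssc.socketTextStream("localhost", 9999)
val pairs = lines.flatMap(_.split(" ")).map(word => (word, 1))
val runningCounts = pairs.updateStateByKey[Int](updateFunction)
runningCounts.print()

ssc.start()
ssc.awaitTermination()
{% endhighlight %}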

#### Transform Operation

    -The `socketTextStream` returns a DStream of text data received from a TCP server socket. The `lines` DStream is _transformed_ into a DStream using the `flatMap` operation, where each line is split into words. This `words` DStream is then mapped to a DStream of `(word, 1)` pairs, which is finally reduced to get the word counts. `wordCounts.print()` will print 10 of the counts generated every second. +The `transform` operation (along with its variations like `transformWith`) allows +arbitrary RDD-to-RDD functions to be applied on a DStream. It can be used to apply any RDD +operation that is not exposed in the DStream API. +For example, the functionality of joining every batch in a data stream +with another dataset is not directly exposed in the DStream API. However, +you can easily use `transform` to do this. This enables very powerful possibilities. For example, +if you want to do real-time data cleaning by joining the input data stream with precomputed +spam information (maybe generated with Spark as well) and then filtering based on it. -To run this example on your local machine, you need to first run a Netcat server by using +
    +
    -{% highlight bash %} -$ nc -lk 9999 +{% highlight scala %} +val spamInfoRDD = sparkContext.hadoopFile(...) // RDD containing spam information + +val cleanedDStream = inputDStream.transform(rdd => { + rdd.join(spamInfoRDD).filter(...) // join data stream with spam information to do data cleaning + ... +}) {% endhighlight %} -Then, in a different terminal, you can start NetworkWordCount by using +
    +
    -{% highlight bash %} -$ ./bin/run-example org.apache.spark.streaming.examples.NetworkWordCount local[2] localhost 9999 +{% highlight java %} +// RDD containing spam information +JavaPairRDD spamInfoRDD = javaSparkContext.hadoopFile(...); + +JavaPairDStream cleanedDStream = inputDStream.transform( + new Function, JavaPairRDD>() { + @Override public JavaPairRDD call(JavaPairRDD rdd) throws Exception { + rdd.join(spamInfoRDD).filter(...) // join data stream with spam information to do data cleaning + ... + } + }); {% endhighlight %} -This will make NetworkWordCount connect to the netcat server. Any lines typed in the terminal running the netcat server will be counted and printed on screen. +
    +
    - - - -
    -{% highlight bash %} -# TERMINAL 1 -# RUNNING NETCAT +In fact, you can also use [machine learning](mllib-guide.html) and +[graph computation](graphx-programming-guide.html) algorithms in the `transform` method. -$ nc -lk 9999 -hello world +

#### Window Operations

    +Finally, Spark Streaming also provides *windowed computations*, which allow you to apply +transformations over a sliding window of data. This following figure illustrates this sliding +window. +

[figure: Spark Streaming]

As shown in the figure, every time the window *slides* over a source DStream, the source RDDs that fall within the window are combined and operated upon to produce the RDDs of the windowed DStream. In this specific case, the operation is applied over the last 3 time units of data, and slides by 2 time units. This shows that any window-based operation needs to specify two parameters.

 * window length - The duration of the window (3 in the figure).
 * slide interval - The interval at which the window-based operation is performed (2 in the figure).

These two parameters must be multiples of the batch interval of the source DStream (1 in the figure).

Let's illustrate the window operations with an example. Say you want to extend the [earlier example](#a-quick-example) by generating word counts over the last 30 seconds of data, every 10 seconds. To do this, we have to apply the `reduceByKey` operation on the `pairs` DStream of `(word, 1)` pairs over the last 30 seconds of data. This is done using the operation `reduceByKeyAndWindow`.
    +
    + +{% highlight scala %} +// Reduce last 30 seconds of data, every 10 seconds +val windowedWordCounts = pairs.reduceByKeyAndWindow(_ + _, Seconds(30), Seconds(10)) {% endhighlight %} -
    -{% highlight bash %} -# TERMINAL 2: RUNNING NetworkWordCount -... -------------------------------------------- -Time: 1357008430000 ms -------------------------------------------- -(hello,1) -(world,1) -... + +
    + +{% highlight java %} +// Reduce function adding two integers, defined separately for clarity +Function2 reduceFunc = new Function2() { + @Override public Integer call(Integer i1, Integer i2) throws Exception { + return i1 + i2; + } +}; + +// Reduce last 30 seconds of data, every 10 seconds +JavaPairDStream windowedWordCounts = pair.reduceByKeyAndWindow(reduceFunc, new Duration(30000), new Duration(10000)); {% endhighlight %} -
    -You can find more examples in `/streaming/src/main/scala/org/apache/spark/streaming/examples/`. They can be run in the similar manner using `./bin/run-example org.apache.spark.streaming.examples....` . Executing without any parameter would give the required parameter list. Further explanation to run them can be found in comments in the files. + + -# DStream Persistence -Similar to RDDs, DStreams also allow developers to persist the stream's data in memory. That is, using `persist()` method on a DStream would automatically persist every RDD of that DStream in memory. This is useful if the data in the DStream will be computed multiple times (e.g., multiple operations on the same data). For window-based operations like `reduceByWindow` and `reduceByKeyAndWindow` and state-based operations like `updateStateByKey`, this is implicitly true. Hence, DStreams generated by window-based operations are automatically persisted in memory, without the developer calling `persist()`. +Some of the common window-based operations are as follows. All of these operations take the +said two parameters - windowLength and slideInterval. -For input streams that receive data from the network (that is, subclasses of NetworkInputDStream like FlumeInputDStream and KafkaInputDStream), the default persistence level is set to replicate the data to two nodes for fault-tolerance. + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Transformation | Meaning
    window(windowLength, slideInterval) Return a new DStream which is computed based on windowed batches of the source DStream. +
    countByWindow(windowLength, slideInterval) Return a sliding window count of elements in the stream. +
    reduceByWindow(func, windowLength, slideInterval) Return a new single-element stream, created by aggregating elements in the stream over a + sliding interval using func. The function should be associative so that it can be computed + correctly in parallel. +
reduceByKeyAndWindow(func, windowLength, slideInterval, + [numTasks]) When called on a DStream of (K, V) pairs, returns a new DStream of (K, V) + pairs where the values for each key are aggregated using the given reduce function func + over batches in a sliding window. Note: By default, this uses Spark's default number of + parallel tasks (2 for local machine, 8 for a cluster) to do the grouping. You can pass an optional + numTasks argument to set a different number of tasks (see the sketch after this table). +
reduceByKeyAndWindow(func, invFunc, windowLength, + slideInterval, [numTasks]) A more efficient version of the above reduceByKeyAndWindow() where the reduce + value of each window is calculated incrementally using the reduce values of the previous window. + This is done by reducing the new data that enter the sliding window, and "inverse reducing" the + old data that leave the window. An example would be that of "adding" and "subtracting" counts + of keys as the window slides. However, it is applicable only to "invertible reduce functions", + that is, those reduce functions which have a corresponding "inverse reduce" function (taken as + parameter invFunc). Like in reduceByKeyAndWindow, the number of reduce tasks + is configurable through an optional argument. +
    countByValueAndWindow(windowLength, + slideInterval, [numTasks]) When called on a DStream of (K, V) pairs, returns a new DStream of (K, Long) pairs where the + value of each key is its frequency within a sliding window. Like in + reduceByKeyAndWindow, the number of reduce tasks is configurable through an + optional argument. +
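As a quick illustration of the optional numTasks argument mentioned above, here is a minimal sketch that reuses the `pairs` DStream of `(word, 1)` pairs from the earlier example; the value 16 is just a placeholder, not a recommendation.

{% highlight scala %}
// Same windowed word count as before, but with an explicit number of reduce tasks
// (16 is an arbitrary placeholder) rather than the default level of parallelism
val windowedWordCounts =
  pairs.reduceByKeyAndWindow((a: Int, b: Int) => a + b, Seconds(30), Seconds(10), 16)
{% endhighlight %}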
    -Note that, unlike RDDs, the default persistence level of DStreams keeps the data serialized in memory. This is further discussed in the [Performance Tuning](#memory-tuning) section. More information on different persistence levels can be found in [Spark Programming Guide](scala-programming-guide.html#rdd-persistence). +### Output Operations +When an output operator is called, it triggers the computation of a stream. Currently the following +output operators are defined: -# RDD Checkpointing within DStreams -A _stateful operation_ is one which operates over multiple batches of data. This includes all window-based operations and the `updateStateByKey` operation. + + + + + + + + + + + + + + + + + + + + + + + +
Output Operation | Meaning
    print() Prints first ten elements of every batch of data in a DStream on the driver.
foreachRDD(func) The fundamental output operator. Applies a function, func, to each RDD generated from + the stream. This function should have side effects, such as printing output, saving the RDD to + external files, or writing it over the network to an external system (see the sketch after this table).
    saveAsObjectFiles(prefix, [suffix]) Save this DStream's contents as a SequenceFile of serialized objects. The file + name at each batch interval is generated based on prefix and + suffix: "prefix-TIME_IN_MS[.suffix]". +
saveAsTextFiles(prefix, [suffix]) Save this DStream's contents as text files. The file name at each batch interval is + generated based on prefix and suffix: "prefix-TIME_IN_MS[.suffix]".
    saveAsHadoopFiles(prefix, [suffix]) Save this DStream's contents as a Hadoop file. The file name at each batch interval is + generated based on prefix and suffix: "prefix-TIME_IN_MS[.suffix]".
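To make `foreachRDD` concrete, here is a minimal sketch of pushing each batch's results out of the stream. It assumes the `wordCounts` DStream of `(word, count)` pairs from the earlier example, and collecting to the driver is only sensible when the per-batch result is small.

{% highlight scala %}
// For every batch, bring the (small) result RDD to the driver and print it
wordCounts.foreachRDD(rdd => {
  rdd.collect().foreach { case (word, count) =>
    println(word + ": " + count)
  }
})
{% endhighlight %}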
    -Because stateful operations have a dependency on previous batches of data, they continuously accumulate metadata over time. To clear this metadata, streaming supports periodic _checkpointing_ by saving intermediate data to HDFS. Note that checkpointing also incurs the cost of saving to HDFS which may cause the corresponding batch to take longer to process. Hence, the interval of checkpointing needs to be set carefully. At small batch sizes (say 1 second), checkpointing every batch may significantly reduce operation throughput. Conversely, checkpointing too slowly causes the lineage and task sizes to grow which may have detrimental effects. Typically, a checkpoint interval of 5 - 10 times of sliding interval of a DStream is good setting to try. -To enable checkpointing, the developer has to provide the HDFS path to which RDD will be saved. This is done by using +The complete list of DStream operations is available in the API documentation. For the Scala API, +see [DStream](api/streaming/index.html#org.apache.spark.streaming.dstream.DStream) +and [PairDStreamFunctions](api/streaming/index.html#org.apache.spark.streaming.dstream.PairDStreamFunctions). +For the Java API, see [JavaDStream](api/streaming/index.html#org.apache.spark.streaming.api.java.dstream.DStream) +and [JavaPairDStream](api/streaming/index.html#org.apache.spark.streaming.api.java.JavaPairDStream). +Specifically for the Java API, see [Spark's Java programming guide](java-programming-guide.html) +for more information. + +## Persistence +Similar to RDDs, DStreams also allow developers to persist the stream's data in memory. That is, +using `persist()` method on a DStream would automatically persist every RDD of that DStream in +memory. This is useful if the data in the DStream will be computed multiple times (e.g., multiple +operations on the same data). For window-based operations like `reduceByWindow` and +`reduceByKeyAndWindow` and state-based operations like `updateStateByKey`, this is implicitly true. +Hence, DStreams generated by window-based operations are automatically persisted in memory, without +the developer calling `persist()`. + +For input streams that receive data over the network (such as, Kafka, Flume, sockets, etc.), the +default persistence level is set to replicate the data to two nodes for fault-tolerance. + +Note that, unlike RDDs, the default persistence level of DStreams keeps the data serialized in +memory. This is further discussed in the [Performance Tuning](#memory-tuning) section. More +information on different persistence levels can be found in +[Spark Programming Guide](scala-programming-guide.html#rdd-persistence). + +## RDD Checkpointing +A _stateful operation_ is one which operates over multiple batches of data. This includes all +window-based operations and the `updateStateByKey` operation. Since stateful operations have a +dependency on previous batches of data, they continuously accumulate metadata over time. +To clear this metadata, streaming supports periodic _checkpointing_ by saving intermediate data +to HDFS. Note that checkpointing also incurs the cost of saving to HDFS which may cause the +corresponding batch to take longer to process. Hence, the interval of checkpointing needs to be +set carefully. At small batch sizes (say 1 second), checkpointing every batch may significantly +reduce operation throughput. Conversely, checkpointing too slowly causes the lineage and task +sizes to grow which may have detrimental effects. 
Typically, a checkpoint interval of 5 - 10 +times the sliding interval of a DStream is a good setting to try. + +To enable checkpointing, the developer has to provide the HDFS path to which the RDDs will be saved. +This is done by using {% highlight scala %} -ssc.checkpoint(hdfsPath) // assuming ssc is the StreamingContext +ssc.checkpoint(hdfsPath) // assuming ssc is the StreamingContext or JavaStreamingContext {% endhighlight %} The interval of checkpointing of a DStream can be set by using {% highlight scala %} -dstream.checkpoint(checkpointInterval) // checkpointInterval must be a multiple of slide duration of dstream +dstream.checkpoint(checkpointInterval) {% endhighlight %} -For DStreams that must be checkpointed (that is, DStreams created by `updateStateByKey` and `reduceByKeyAndWindow` with inverse function), the checkpoint interval of the DStream is by default set to a multiple of the DStream's sliding interval such that its at least 10 seconds. +For DStreams that must be checkpointed (that is, DStreams created by `updateStateByKey` and +`reduceByKeyAndWindow` with inverse function), the checkpoint interval of the DStream is by +default set to a multiple of the DStream's sliding interval such that it's at least 10 seconds. - -## Custom Receivers -Spark comes with a built in support for most common usage scenarios where input stream source can be either a network socket stream to support for a few message queues. Apart from that it is also possible to supply your own custom receiver via a convenient API. Find more details at [Custom Receiver Guide](streaming-custom-receivers.html). +*************************************************************************************************** # Performance Tuning -Getting the best performance of a Spark Streaming application on a cluster requires a bit of tuning. This section explains a number of the parameters and configurations that can tuned to improve the performance of you application. At a high level, you need to consider two things: +Getting the best performance of a Spark Streaming application on a cluster requires a bit of +tuning. This section explains a number of the parameters and configurations that can be tuned to +improve the performance of your application. At a high level, you need to consider two things: +
      -
    1. Reducing the processing time of each batch of data by efficiently using cluster resources.
    2. -
    3. Setting the right batch size such that the data processing can keep up with the data ingestion.
    4. +
    5. + Reducing the processing time of each batch of data by efficiently using cluster resources. +
    6. +
    7. + Setting the right batch size such that the data processing can keep up with the data ingestion. +
    ## Reducing the Processing Time of each Batch -There are a number of optimizations that can be done in Spark to minimize the processing time of each batch. These have been discussed in detail in [Tuning Guide](tuning.html). This section highlights some of the most important ones. +There are a number of optimizations that can be done in Spark to minimize the processing time of +each batch. These have been discussed in detail in [Tuning Guide](tuning.html). This section +highlights some of the most important ones. ### Level of Parallelism -Cluster resources maybe under-utilized if the number of parallel tasks used in any stage of the computation is not high enough. For example, for distributed reduce operations like `reduceByKey` and `reduceByKeyAndWindow`, the default number of parallel tasks is 8. You can pass the level of parallelism as an argument (see the [`PairDStreamFunctions`](api/streaming/index.html#org.apache.spark.PairDStreamFunctions) documentation), or set the [config property](configuration.html#spark-properties) `spark.default.parallelism` to change the default. +Cluster resources maybe under-utilized if the number of parallel tasks used in any stage of the +computation is not high enough. For example, for distributed reduce operations like `reduceByKey` +and `reduceByKeyAndWindow`, the default number of parallel tasks is 8. You can pass the level of +parallelism as an argument (see the +[`PairDStreamFunctions`](api/streaming/index.html#org.apache.spark.streaming.dstream.PairDStreamFunctions) +documentation), or set the [config property](configuration.html#spark-properties) +`spark.default.parallelism` to change the default. ### Data Serialization -The overhead of data serialization can be significant, especially when sub-second batch sizes are to be achieved. There are two aspects to it. +The overhead of data serialization can be significant, especially when sub-second batch sizes are + to be achieved. There are two aspects to it. -* **Serialization of RDD data in Spark**: Please refer to the detailed discussion on data serialization in the [Tuning Guide](tuning.html). However, note that unlike Spark, by default RDDs are persisted as serialized byte arrays to minimize pauses related to GC. +* **Serialization of RDD data in Spark**: Please refer to the detailed discussion on data + serialization in the [Tuning Guide](tuning.html). However, note that unlike Spark, by default + RDDs are persisted as serialized byte arrays to minimize pauses related to GC. -* **Serialization of input data**: To ingest external data into Spark, data received as bytes (say, from the network) needs to deserialized from bytes and re-serialized into Spark's serialization format. Hence, the deserialization overhead of input data may be a bottleneck. +* **Serialization of input data**: To ingest external data into Spark, data received as bytes + (say, from the network) needs to deserialized from bytes and re-serialized into Spark's + serialization format. Hence, the deserialization overhead of input data may be a bottleneck. ### Task Launching Overheads -If the number of tasks launched per second is high (say, 50 or more per second), then the overhead of sending out tasks to the slaves maybe significant and will make it hard to achieve sub-second latencies. 
The overhead can be reduced by the following changes: +If the number of tasks launched per second is high (say, 50 or more per second), then the overhead +of sending out tasks to the slaves may be significant and will make it hard to achieve sub-second +latencies. The overhead can be reduced by the following changes: -* **Task Serialization**: Using Kryo serialization for serializing tasks can reduced the task sizes, and therefore reduce the time taken to send them to the slaves. +* **Task Serialization**: Using Kryo serialization for serializing tasks can reduce the task + sizes, and therefore reduce the time taken to send them to the slaves. -* **Execution mode**: Running Spark in Standalone mode or coarse-grained Mesos mode leads to better task launch times than the fine-grained Mesos mode. Please refer to the [Running on Mesos guide](running-on-mesos.html) for more details. -These changes may reduce batch processing time by 100s of milliseconds, thus allowing sub-second batch size to be viable. +* **Execution mode**: Running Spark in Standalone mode or coarse-grained Mesos mode leads to + better task launch times than the fine-grained Mesos mode. Please refer to the + [Running on Mesos guide](running-on-mesos.html) for more details. -## Setting the Right Batch Size -For a Spark Streaming application running on a cluster to be stable, the processing of the data streams must keep up with the rate of ingestion of the data streams. Depending on the type of computation, the batch size used may have significant impact on the rate of ingestion that can be sustained by the Spark Streaming application on a fixed cluster resources. For example, let us consider the earlier WordCountNetwork example. For a particular data rate, the system may be able to keep up with reporting word counts every 2 seconds (i.e., batch size of 2 seconds), but not every 500 milliseconds. +These changes may reduce batch processing time by 100s of milliseconds, +thus allowing sub-second batch size to be viable. +## Setting the Right Batch Size +For a Spark Streaming application running on a cluster to be stable, the processing of the data +streams must keep up with the rate of ingestion of the data streams. Depending on the type of +computation, the batch size used may have significant impact on the rate of ingestion that can be +sustained by the Spark Streaming application on fixed cluster resources. For example, let us +consider the earlier WordCountNetwork example. For a particular data rate, the system may be able +to keep up with reporting word counts every 2 seconds (i.e., batch size of 2 seconds), but not +every 500 milliseconds.
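To make this trade-off concrete, the batch size is simply the batch duration passed when the streaming context is created. A small sketch follows; the master URL and application name are placeholders.

{% highlight scala %}
import org.apache.spark.streaming.{Milliseconds, Seconds, StreamingContext}

// A conservative 2-second batch interval that the cluster can sustain at this data rate ...
val ssc = new StreamingContext("local[2]", "NetworkWordCount", Seconds(2))

// ... versus a more aggressive 500 ms interval, which may not keep up at the same data rate
// val ssc = new StreamingContext("local[2]", "NetworkWordCount", Milliseconds(500))
{% endhighlight %}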
+ +A good approach to figure out the right batch size for your application is to test it with a +conservative batch size (say, 5-10 seconds) and a low data rate. To verify whether the system +is able to keep up with the data rate, you can check the value of the end-to-end delay experienced +by each processed batch (either look for "Total delay" in Spark driver log4j logs, or use the +[StreamingListener](api/streaming/index.html#org.apache.spark.streaming.scheduler.StreamingListener) +interface). +If the delay is maintained to be comparable to the batch size, then the system is stable. Otherwise, +if the delay is continuously increasing, it means that the system is unable to keep up and is +therefore unstable. Once you have an idea of a stable configuration, you can try increasing the +data rate and/or reducing the batch size. Note that a momentary increase in the delay due to +temporary data rate increases may be fine as long as the delay reduces back to a low value +(i.e., less than batch size). ## 24/7 Operation -By default, Spark does not forget any of the metadata (RDDs generated, stages processed, etc.). But for a Spark Streaming application to operate 24/7, it is necessary for Spark to do periodic cleanup of it metadata. This can be enabled by setting the [config property](configuration.html#spark-properties) `spark.cleaner.ttl` to the number of seconds you want any metadata to persist. For example, setting `spark.cleaner.ttl` to 600 would cause Spark periodically cleanup all metadata and persisted RDDs that are older than 10 minutes. Note, that this property needs to be set before the SparkContext is created. - -This value is closely tied with any window operation that is being used. Any window operation would require the input data to be persisted in memory for at least the duration of the window. Hence it is necessary to set the delay to at least the value of the largest window operation used in the Spark Streaming application. If this delay is set too low, the application will throw an exception saying so. +By default, Spark does not forget any of the metadata (RDDs generated, stages processed, etc.). +But for a Spark Streaming application to operate 24/7, it is necessary for Spark to do periodic +cleanup of its metadata. This can be enabled by setting the +[configuration property](configuration.html#spark-properties) `spark.cleaner.ttl` to the number of +seconds you want any metadata to persist. For example, setting `spark.cleaner.ttl` to 600 would +cause Spark to periodically clean up all metadata and persisted RDDs that are older than 10 minutes. +Note that this property needs to be set before the SparkContext is created. + +This value is closely tied with any window operation that is being used. Any window operation +would require the input data to be persisted in memory for at least the duration of the window. +Hence it is necessary to set the delay to at least the value of the largest window operation used +in the Spark Streaming application. If this delay is set too low, the application will throw an +exception saying so. + +## Monitoring +Besides Spark's in-built [monitoring capabilities](monitoring.html), +the progress of a Spark Streaming program can also be monitored using the +[StreamingListener](api/streaming/index.html#org.apache.spark.streaming.scheduler.StreamingListener) +interface, which allows you to get statistics of batch processing times, queueing delays, +and total end-to-end delays.
Note that this is still an experimental API and it is likely to be +improved upon (i.e., more information reported) in the future. ## Memory Tuning -Tuning the memory usage and GC behavior of Spark applications have been discussed in great detail in the [Tuning Guide](tuning.html). It is recommended that you read that. In this section, we highlight a few customizations that are strongly recommended to minimize GC related pauses in Spark Streaming applications and achieving more consistent batch processing times. - -* **Default persistence level of DStreams**: Unlike RDDs, the default persistence level of DStreams serializes the data in memory (that is, [StorageLevel.MEMORY_ONLY_SER](api/core/index.html#org.apache.spark.storage.StorageLevel$) for DStream compared to [StorageLevel.MEMORY_ONLY](api/core/index.html#org.apache.spark.storage.StorageLevel$) for RDDs). Even though keeping the data serialized incurs a higher serialization overheads, it significantly reduces GC pauses. - -* **Concurrent garbage collector**: Using the concurrent mark-and-sweep GC further minimizes the variability of GC pauses. Even though concurrent GC is known to reduce the overall processing throughput of the system, its use is still recommended to achieve more consistent batch processing times. +Tuning the memory usage and GC behavior of Spark applications have been discussed in great detail +in the [Tuning Guide](tuning.html). It is recommended that you read that. In this section, +we highlight a few customizations that are strongly recommended to minimize GC related pauses +in Spark Streaming applications and achieving more consistent batch processing times. + +* **Default persistence level of DStreams**: Unlike RDDs, the default persistence level of DStreams +serializes the data in memory (that is, +[StorageLevel.MEMORY_ONLY_SER](api/core/index.html#org.apache.spark.storage.StorageLevel$) for +DStream compared to +[StorageLevel.MEMORY_ONLY](api/core/index.html#org.apache.spark.storage.StorageLevel$) for RDDs). +Even though keeping the data serialized incurs higher serialization/deserialization overheads, +it significantly reduces GC pauses. + +* **Clearing persistent RDDs**: By default, all persistent RDDs generated by Spark Streaming will + be cleared from memory based on Spark's in-built policy (LRU). If `spark.cleaner.ttl` is set, + then persistent RDDs that are older than that value are periodically cleared. As mentioned + [earlier](#operation), this needs to be careful set based on operations used in the Spark + Streaming program. However, a smarter unpersisting of RDDs can be enabled by setting the + [configuration property](configuration.html#spark-properties) `spark.streaming.unpersist` to + `true`. This makes the system to figure out which RDDs are not necessary to be kept around and + unpersists them. This is likely to reduce + the RDD memory usage of Spark, potentially improving GC behavior as well. + +* **Concurrent garbage collector**: Using the concurrent mark-and-sweep GC further +minimizes the variability of GC pauses. Even though concurrent GC is known to reduce the +overall processing throughput of the system, its use is still recommended to achieve more +consistent batch processing times. + +*************************************************************************************************** # Fault-tolerance Properties -In this section, we are going to discuss the behavior of Spark Streaming application in the event of a node failure. 
To understand this, let us remember the basic fault-tolerance properties of Spark's RDDs. +In this section, we are going to discuss the behavior of Spark Streaming application in the event +of a node failure. To understand this, let us remember the basic fault-tolerance properties of +Spark's RDDs. - 1. An RDD is an immutable, and deterministically re-computable, distributed dataset. Each RDD remembers the lineage of deterministic operations that were used on a fault-tolerant input dataset to create it. - 1. If any partition of an RDD is lost due to a worker node failure, then that partition can be re-computed from the original fault-tolerant dataset using the lineage of operations. + 1. An RDD is an immutable, deterministically re-computable, distributed dataset. Each RDD + remembers the lineage of deterministic operations that were used on a fault-tolerant input + dataset to create it. + 1. If any partition of an RDD is lost due to a worker node failure, then that partition can be + re-computed from the original fault-tolerant dataset using the lineage of operations. -Since all data transformations in Spark Streaming are based on RDD operations, as long as the input dataset is present, all intermediate data can recomputed. Keeping these properties in mind, we are going to discuss the failure semantics in more detail. +Since all data transformations in Spark Streaming are based on RDD operations, as long as the input +dataset is present, all intermediate data can recomputed. Keeping these properties in mind, we are +going to discuss the failure semantics in more detail. ## Failure of a Worker Node - There are two failure behaviors based on which input sources are used. -1. _Using HDFS files as input source_ - Since the data is reliably stored on HDFS, all data can re-computed and therefore no data will be lost due to any failure. -1. _Using any input source that receives data through a network_ - For network-based data sources like Kafka and Flume, the received input data is replicated in memory between nodes of the cluster (default replication factor is 2). So if a worker node fails, then the system can recompute the lost from the the left over copy of the input data. However, if the worker node where a network receiver was running fails, then a tiny bit of data may be lost, that is, the data received by the system but not yet replicated to other node(s). The receiver will be started on a different node and it will continue to receive data. - -Since all data is modeled as RDDs with their lineage of deterministic operations, any recomputation always leads to the same result. As a result, all DStream transformations are guaranteed to have _exactly-once_ semantics. That is, the final transformed result will be same even if there were was a worker node failure. However, output operations (like `foreachRDD`) have _at-least once_ semantics, that is, the transformed data may get written to an external entity more than once in the event of a worker failure. While this is acceptable for saving to HDFS using the `saveAs*Files` operations (as the file will simply get over-written by the same data), additional transactions-like mechanisms may be necessary to achieve exactly-once semantics for output operations. +1. _Using HDFS files as input source_ - Since the data is reliably stored on HDFS, all data can +re-computed and therefore no data will be lost due to any failure. +1. 
_Using any input source that receives data through a network_ - For network-based data sources +like Kafka and Flume, the received input data is replicated in memory between nodes of the cluster +(default replication factor is 2). So if a worker node fails, then the system can recompute the +lost data from the leftover copy of the input data. However, if the worker node where a network +receiver was running fails, then a tiny bit of data may be lost, that is, the data received by +the system but not yet replicated to other node(s). The receiver will be started on a different +node and it will continue to receive data. + +Since all data is modeled as RDDs with their lineage of deterministic operations, any recomputation + always leads to the same result. As a result, all DStream transformations are guaranteed to have + _exactly-once_ semantics. That is, the final transformed result will be the same even if there + was a worker node failure. However, output operations (like `foreachRDD`) have _at-least once_ + semantics, that is, the transformed data may get written to an external entity more than once in + the event of a worker failure. While this is acceptable for saving to HDFS using the + `saveAs*Files` operations (as the file will simply get over-written by the same data), + additional transaction-like mechanisms may be necessary to achieve exactly-once semantics + for output operations. ## Failure of the Driver Node -A system that is required to operate 24/7 needs to be able tolerate the failure of the driver node as well. Spark Streaming does this by saving the state of the DStream computation periodically to a HDFS file, that can be used to restart the streaming computation in the event of a failure of the driver node. This checkpointing is enabled by setting a HDFS directory for checkpointing using `ssc.checkpoint()` as described [earlier](#rdd-checkpointing-within-dstreams). To elaborate, the following state is periodically saved to a file. +To allow a streaming application to operate 24/7, Spark Streaming allows a streaming computation +to be resumed even after the failure of the driver node. Spark Streaming periodically writes the +metadata information of the DStreams set up through the `StreamingContext` to an +HDFS directory (can be any Hadoop-compatible filesystem). This periodic +*checkpointing* can be enabled by setting the checkpoint +directory using `ssc.checkpoint()` as described +[earlier](#rdd-checkpointing). On failure of the driver node, +the lost `StreamingContext` can be recovered from this information, and restarted. + +To allow a Spark Streaming program to be recoverable, it must be written in a way such that +it has the following behavior: + +1. When the program is being started for the first time, it will create a new StreamingContext, + set up all the streams and then call start(). +1. When the program is being restarted after failure, it will re-create a StreamingContext + from the checkpoint data in the checkpoint directory. -1. The DStream operator graph (input streams, output streams, etc.) -1. The configuration of each DStream (checkpoint interval, etc.) -1. The RDD checkpoint files of each DStream +
    +
    -All this is periodically saved in the checkpoint directory. To recover, a new `StreamingContext` can be created with this directory by using +This behavior is made simple by using `StreamingContext.getOrCreate`. This is used as follows. {% highlight scala %} -val ssc = new StreamingContext(checkpointDirectory) +// Function to create and setup a new StreamingContext +def functionToCreateContext(): StreamingContext = { + val ssc = new StreamingContext(...) // new context + val lines = ssc.socketTextStream(...) // create DStreams + ... + ssc.checkpoint(checkpointDirectory) // set checkpoint directory + ssc +} + +// Get StreaminContext from checkpoint data or create a new one +val context = StreamingContext.getOrCreate(checkpointDirectory, functionToCreateContext _) + +// Do additional setup on context that needs to be done, +// irrespective of whether it is being started or restarted +context. ... + +// Start the context +context.start() +context.awaitTermination() {% endhighlight %} -On calling `ssc.start()` on this new context, the following steps are taken by the system +If the `checkpointDirectory` exists, then the context will be recreated from the checkpoint data. +If the directory does not exist (i.e., running for the first time), +then the function `functionToCreateContext` will be called to create a new +context and set up the DStreams. See the Scala example +[RecoverableNetworkWordCount]({{site.SPARK_GITHUB_URL}}/tree/master/examples/src/main/scala/org/apache/spark/streaming/examples/RecoverableNetworkWordCount.scala). +This example appends the word counts of network data into a file. -1. Schedule the transformations and output operations for all the time steps between the time when the driver failed and when it last checkpointed. This is also done for those time steps that were previously scheduled but not processed due to the failure. This will make the system recompute all the intermediate data from the checkpointed RDD files, etc. -1. Restart the network receivers, if any, and continue receiving new data. +You can also explicitly create a `StreamingContext` from the checkpoint data and start the + computation by using `new StreamingContext(checkpointDirectory)`. -There are two different failure behaviors based on which input sources are used. +
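For reference, a minimal sketch of that explicit recovery path (assuming `checkpointDirectory` holds the checkpoint path that was set earlier):

{% highlight scala %}
// Recreate the StreamingContext directly from existing checkpoint data and resume the computation
val recoveredContext = new StreamingContext(checkpointDirectory)
recoveredContext.start()
recoveredContext.awaitTermination()
{% endhighlight %}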
    +
- -1. _Using HDFS files as input source_ - Since the data is reliably stored on HDFS, all data can re-computed and therefore no data will be lost due to any failure. -1. _Using any input source that receives data through a network_ - The received input data is replicated in memory to multiple nodes. Since, all the data in the Spark worker's memory is lost when the Spark driver fails, the past input data will not be accessible and driver recovers. Hence, if stateful and window-based operations are used (like `updateStateByKey`, `window`, `countByValueAndWindow`, etc.), then the intermediate state will not be recovered completely. +This behavior is made simple by using `JavaStreamingContext.getOrCreate`. This is used as follows. -In future releases, we will support full recoverability for all input sources. Note that for non-stateful transformations like `map`, `count`, and `reduceByKey`, with _all_ input streams, the system, upon restarting, will continue to receive and process new data. +{% highlight java %} +// Create a factory object that can create and set up a new JavaStreamingContext +JavaStreamingContextFactory contextFactory = new JavaStreamingContextFactory() { + @Override public JavaStreamingContext create() { + JavaStreamingContext jssc = new JavaStreamingContext(...); // new context + JavaDStream lines = jssc.socketTextStream(...); // create DStreams + ... + jssc.checkpoint(checkpointDirectory); // set checkpoint directory + return jssc; + } +}; + +// Get JavaStreamingContext from checkpoint data or create a new one +JavaStreamingContext context = JavaStreamingContext.getOrCreate(checkpointDirectory, contextFactory); + +// Do additional setup on context that needs to be done, +// irrespective of whether it is being started or restarted +context. ... + +// Start the context +context.start(); +context.awaitTermination(); +{% endhighlight %} -To better understand the behavior of the system under driver failure with a HDFS source, lets consider what will happen with a file input stream Specifically, in the case of the file input stream, it will correctly identify new files that were created while the driver was down and process them in the same way as it would have if the driver had not failed. To explain further in the case of file input stream, we shall use an example. Lets say, files are being generated every second, and a Spark Streaming program reads every new file and output the number of lines in the file. This is what the sequence of outputs would be with and without a driver failure. +If the `checkpointDirectory` exists, then the context will be recreated from the checkpoint data. +If the directory does not exist (i.e., running for the first time), +then the function `contextFactory` will be called to create a new +context and set up the DStreams. See the Java example +[JavaRecoverableWordCount]({{site.SPARK_GITHUB_URL}}/tree/master/examples/src/main/scala/org/apache/spark/streaming/examples/JavaRecoverableWordCount.scala) +(note that this example is missing in the 0.9 release, so you can test it using the master branch). +This example appends the word counts of network data into a file. + +You can also explicitly create a `JavaStreamingContext` from the checkpoint data and start +the computation by using `new JavaStreamingContext(checkpointDirectory)`. +
    +
+ +**Note**: If Spark Streaming and/or the Spark Streaming program is recompiled, +you *must* create a new `StreamingContext` or `JavaStreamingContext`, +not recreate it from checkpoint data. This is because trying to load a +context from checkpoint data may fail if the data was generated before recompilation of the +classes. So, if you are using `getOrCreate`, then make sure that the checkpoint directory is +explicitly deleted every time recompiled code needs to be launched. + +This failure recovery can be done automatically using Spark's +[standalone cluster mode](spark-standalone.html), which allows any Spark +application's driver to be launched within the cluster and ensures automatic restart of the driver on failure (see +[supervise mode](spark-standalone.html#launching-applications-inside-the-cluster)). This can be +tested locally by launching the above example using the supervise mode in a +local standalone cluster and killing the Java process running the driver (will be shown as +*DriverWrapper* when `jps` is run to show all active Java processes). The driver should be +automatically restarted, and the word counts will continue. + +For other deployment environments like Mesos and Yarn, you have to restart the driver through other +mechanisms. +

### Recovery Semantics

+ +There are two different failure behaviors based on which input sources are used. + +1. _Using HDFS files as input source_ - Since the data is reliably stored on HDFS, all data can be +re-computed and therefore no data will be lost due to any failure. +1. _Using any input source that receives data through a network_ - The received input data is +replicated in memory to multiple nodes. Since all the data in the Spark worker's memory is lost +when the Spark driver fails, the past input data will not be accessible when the driver recovers. +Hence, if stateful and window-based operations are used +(like `updateStateByKey`, `window`, `countByValueAndWindow`, etc.), then the intermediate state +will not be recovered completely. + +In future releases, we will support full recoverability for all input sources. Note that for +non-stateful transformations like `map`, `count`, and `reduceByKey`, with _all_ input streams, +the system, upon restarting, will continue to receive and process new data. + +To better understand the behavior of the system under driver failure with an HDFS source, let's +consider what will happen with a file input stream. Specifically, in the case of the file input +stream, it will correctly identify new files that were created while the driver was down and +process them in the same way as it would have if the driver had not failed. To explain further +in the case of file input stream, we shall use an example. Let's say files are being generated +every second, and a Spark Streaming program reads every new file and outputs the number of lines +in the file. This is what the sequence of outputs would be with and without a driver failure. @@ -476,58 +1211,21 @@ To better understand the behavior of the system under driver failure with a HDFS
    -If the driver had crashed in the middle of the processing of time 3, then it will process time 3 and output 30 after recovery. - -# Java API - -Similar to [Spark's Java API](java-programming-guide.html), we also provide a Java API for Spark Streaming which allows all its features to be accessible from a Java program. This is defined in [org.apache.spark.streaming.api.java] (api/streaming/index.html#org.apache.spark.streaming.api.java.package) package and includes [JavaStreamingContext](api/streaming/index.html#org.apache.spark.streaming.api.java.JavaStreamingContext) and [JavaDStream](api/streaming/index.html#org.apache.spark.streaming.api.java.JavaDStream) classes that provide the same methods as their Scala counterparts, but take Java functions (that is, Function, and Function2) and return Java data and collection types. Some of the key points to note are: - -1. Functions for transformations must be implemented as subclasses of [Function](api/core/index.html#org.apache.spark.api.java.function.Function) and [Function2](api/core/index.html#org.apache.spark.api.java.function.Function2) -1. Unlike the Scala API, the Java API handles DStreams for key-value pairs using a separate [JavaPairDStream](api/streaming/index.html#org.apache.spark.streaming.api.java.JavaPairDStream) class(similar to [JavaRDD and JavaPairRDD](java-programming-guide.html#rdd-classes). DStream functions like `map` and `filter` are implemented separately by JavaDStreams and JavaPairDStream to return DStreams of appropriate types. - -Spark's [Java Programming Guide](java-programming-guide.html) gives more ideas about using the Java API. To extends the ideas presented for the RDDs to DStreams, we present parts of the Java version of the same NetworkWordCount example presented above. The full source code is given at `/examples/src/main/java/org/apache/spark/streaming/examples/JavaNetworkWordCount.java` - -The streaming context and the socket stream from input source is started by using a `JavaStreamingContext`, that has the same parameters and provides the same input streams as its Scala counterpart. - -{% highlight java %} -JavaStreamingContext ssc = new JavaStreamingContext(mesosUrl, "NetworkWordCount", Seconds(1)); -JavaDStream lines = ssc.socketTextStream(ip, port); -{% endhighlight %} - - -Then the `lines` are split into words by using the `flatMap` function and [FlatMapFunction](api/core/index.html#org.apache.spark.api.java.function.FlatMapFunction). - -{% highlight java %} -JavaDStream words = lines.flatMap( - new FlatMapFunction() { - @Override - public Iterable call(String x) { - return Lists.newArrayList(x.split(" ")); - } - }); -{% endhighlight %} - -The `words` is then mapped to a [JavaPairDStream](api/streaming/index.html#org.apache.spark.streaming.api.java.JavaPairDStream) of `(word, 1)` pairs using `map` and [PairFunction](api/core/index.html#org.apache.spark.api.java.function.PairFunction). This is reduced by using `reduceByKey` and [Function2](api/core/index.html#org.apache.spark.api.java.function.Function2). - -{% highlight java %} -JavaPairDStream wordCounts = words.map( - new PairFunction() { - @Override - public Tuple2 call(String s) throws Exception { - return new Tuple2(s, 1); - } - }).reduceByKey( - new Function2() { - @Override - public Integer call(Integer i1, Integer i2) throws Exception { - return i1 + i2; - } - }); -{% endhighlight %} - +If the driver had crashed in the middle of the processing of time 3, then it will process time 3 +and output 30 after recovery. 
# Where to Go from Here -* API docs - [Scala](api/streaming/index.html#org.apache.spark.streaming.package) and [Java](api/streaming/index.html#org.apache.spark.streaming.api.java.package) -* More examples - [Scala](https://github.com/apache/incubator-spark/tree/master/examples/src/main/scala/org/apache/spark/streaming/examples) and [Java](https://github.com/apache/incubator-spark/tree/master/examples/src/main/java/org/apache/spark/streaming/examples) -* [Paper describing Spark Streaming](http://www.eecs.berkeley.edu/Pubs/TechRpts/2012/EECS-2012-259.pdf) +* API documentation + - Main docs of StreamingContext and DStreams in [Scala](api/streaming/index.html#org.apache.spark.streaming.package) + and [Java](api/streaming/index.html#org.apache.spark.streaming.api.java.package) + - Additional docs for + [Kafka](api/external/kafka/index.html#org.apache.spark.streaming.kafka.KafkaUtils$), + [Flume](api/external/flume/index.html#org.apache.spark.streaming.flume.FlumeUtils$), + [Twitter](api/external/twitter/index.html#org.apache.spark.streaming.twitter.TwitterUtils$), + [ZeroMQ](api/external/zeromq/index.html#org.apache.spark.streaming.zeromq.ZeroMQUtils$), and + [MQTT](api/external/mqtt/index.html#org.apache.spark.streaming.mqtt.MQTTUtils$) + +* More examples in [Scala]({{site.SPARK_GITHUB_URL}}/tree/master/examples/src/main/scala/org/apache/spark/streaming/examples) + and [Java]({{site.SPARK_GITHUB_URL}}/tree/master/examples/src/main/java/org/apache/spark/streaming/examples) +* [Paper](http://www.eecs.berkeley.edu/Pubs/TechRpts/2012/EECS-2012-259.pdf) describing Spark Streaming diff --git a/ec2/spark_ec2.py b/ec2/spark_ec2.py index e7cb5ab3ff9b0..e88f80aa62627 100755 --- a/ec2/spark_ec2.py +++ b/ec2/spark_ec2.py @@ -70,7 +70,7 @@ def parse_args(): "slaves across multiple (an additional $0.01/Gb for bandwidth" + "between zones applies)") parser.add_option("-a", "--ami", help="Amazon Machine Image ID to use") - parser.add_option("-v", "--spark-version", default="0.8.0", + parser.add_option("-v", "--spark-version", default="0.9.0", help="Version of Spark to use: 'X.Y.Z' or a specific git hash") parser.add_option("--spark-git-repo", default="https://github.com/apache/incubator-spark", @@ -157,7 +157,7 @@ def is_active(instance): # Return correct versions of Spark and Shark, given the supplied Spark version def get_spark_shark_version(opts): - spark_shark_map = {"0.7.3": "0.7.1", "0.8.0": "0.8.0"} + spark_shark_map = {"0.7.3": "0.7.1", "0.8.0": "0.8.0", "0.8.1": "0.8.1", "0.9.0": "0.9.0"} version = opts.spark_version.replace("v", "") if version not in spark_shark_map: print >> stderr, "Don't know about Spark version: %s" % version @@ -189,7 +189,12 @@ def get_spark_ami(opts): "i2.xlarge": "hvm", "i2.2xlarge": "hvm", "i2.4xlarge": "hvm", - "i2.8xlarge": "hvm" + "i2.8xlarge": "hvm", + "c3.large": "pvm", + "c3.xlarge": "pvm", + "c3.2xlarge": "pvm", + "c3.4xlarge": "pvm", + "c3.8xlarge": "pvm" } if opts.instance_type in instance_types: instance_type = instance_types[opts.instance_type] @@ -486,7 +491,12 @@ def get_num_disks(instance_type): "i2.xlarge": 1, "i2.2xlarge": 2, "i2.4xlarge": 4, - "i2.8xlarge": 8 + "i2.8xlarge": 8, + "c3.large": 2, + "c3.xlarge": 2, + "c3.2xlarge": 2, + "c3.4xlarge": 2, + "c3.8xlarge": 2 } if instance_type in disks_by_instance: return disks_by_instance[instance_type] diff --git a/examples/pom.xml b/examples/pom.xml index cb4f7ee33b4a1..967556744c1e6 100644 --- a/examples/pom.xml +++ b/examples/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent - 0.9.0-incubating-SNAPSHOT + 
0.9.1-incubating-SNAPSHOT ../pom.xml @@ -71,6 +71,12 @@ ${project.version} provided + + org.apache.spark + spark-graphx_${scala.binary.version} + ${project.version} + provided + org.apache.spark spark-streaming-twitter_${scala.binary.version} diff --git a/examples/src/main/java/org/apache/spark/examples/JavaLogQuery.java b/examples/src/main/java/org/apache/spark/examples/JavaLogQuery.java index 9eb1cadd71d22..a933f483c710f 100644 --- a/examples/src/main/java/org/apache/spark/examples/JavaLogQuery.java +++ b/examples/src/main/java/org/apache/spark/examples/JavaLogQuery.java @@ -124,7 +124,7 @@ public Stats call(Stats stats, Stats stats2) { List, Stats>> output = counts.collect(); for (Tuple2 t : output) { - System.out.println(t._1 + "\t" + t._2); + System.out.println(t._1() + "\t" + t._2()); } System.exit(0); } diff --git a/examples/src/main/java/org/apache/spark/examples/JavaPageRank.java b/examples/src/main/java/org/apache/spark/examples/JavaPageRank.java index a84245b0c7449..cdcc2c9e67cf9 100644 --- a/examples/src/main/java/org/apache/spark/examples/JavaPageRank.java +++ b/examples/src/main/java/org/apache/spark/examples/JavaPageRank.java @@ -90,8 +90,8 @@ public Double call(List rs) { @Override public Iterable> call(Tuple2, Double> s) { List> results = new ArrayList>(); - for (String n : s._1) { - results.add(new Tuple2(n, s._2 / s._1.size())); + for (String n : s._1()) { + results.add(new Tuple2(n, s._2() / s._1().size())); } return results; } @@ -109,7 +109,7 @@ public Double call(Double sum) { // Collects all URL ranks and dump them to console. List> output = ranks.collect(); for (Tuple2 tuple : output) { - System.out.println(tuple._1 + " has rank: " + tuple._2 + "."); + System.out.println(tuple._1() + " has rank: " + tuple._2() + "."); } System.exit(0); diff --git a/examples/src/main/java/org/apache/spark/examples/JavaSparkPi.java b/examples/src/main/java/org/apache/spark/examples/JavaSparkPi.java index 3ec4a58d48ed6..ac8df02c4630b 100644 --- a/examples/src/main/java/org/apache/spark/examples/JavaSparkPi.java +++ b/examples/src/main/java/org/apache/spark/examples/JavaSparkPi.java @@ -30,11 +30,11 @@ public final class JavaSparkPi { public static void main(String[] args) throws Exception { if (args.length == 0) { - System.err.println("Usage: JavaLogQuery [slices]"); + System.err.println("Usage: JavaSparkPi [slices]"); System.exit(1); } - JavaSparkContext jsc = new JavaSparkContext(args[0], "JavaLogQuery", + JavaSparkContext jsc = new JavaSparkContext(args[0], "JavaSparkPi", System.getenv("SPARK_HOME"), JavaSparkContext.jarOfClass(JavaSparkPi.class)); int slices = (args.length == 2) ? 
Integer.parseInt(args[1]) : 2; diff --git a/examples/src/main/java/org/apache/spark/examples/JavaWordCount.java b/examples/src/main/java/org/apache/spark/examples/JavaWordCount.java index 6651f98d56711..95a2f3ab5290d 100644 --- a/examples/src/main/java/org/apache/spark/examples/JavaWordCount.java +++ b/examples/src/main/java/org/apache/spark/examples/JavaWordCount.java @@ -65,7 +65,7 @@ public Integer call(Integer i1, Integer i2) { List> output = counts.collect(); for (Tuple2 tuple : output) { - System.out.println(tuple._1 + ": " + tuple._2); + System.out.println(tuple._1() + ": " + tuple._2()); } System.exit(0); } diff --git a/examples/src/main/java/org/apache/spark/streaming/examples/JavaFlumeEventCount.java b/examples/src/main/java/org/apache/spark/streaming/examples/JavaFlumeEventCount.java index 7b5a243e26414..f061001dd264d 100644 --- a/examples/src/main/java/org/apache/spark/streaming/examples/JavaFlumeEventCount.java +++ b/examples/src/main/java/org/apache/spark/streaming/examples/JavaFlumeEventCount.java @@ -70,5 +70,6 @@ public String call(Long in) { }).print(); ssc.start(); + ssc.awaitTermination(); } } diff --git a/examples/src/main/java/org/apache/spark/streaming/examples/JavaKafkaWordCount.java b/examples/src/main/java/org/apache/spark/streaming/examples/JavaKafkaWordCount.java index 04f62ee204145..2ffd351b4e498 100644 --- a/examples/src/main/java/org/apache/spark/streaming/examples/JavaKafkaWordCount.java +++ b/examples/src/main/java/org/apache/spark/streaming/examples/JavaKafkaWordCount.java @@ -104,5 +104,6 @@ public Integer call(Integer i1, Integer i2) { wordCounts.print(); jssc.start(); + jssc.awaitTermination(); } } diff --git a/examples/src/main/java/org/apache/spark/streaming/examples/JavaNetworkWordCount.java b/examples/src/main/java/org/apache/spark/streaming/examples/JavaNetworkWordCount.java index 349d826ab5df7..7777c9832abd3 100644 --- a/examples/src/main/java/org/apache/spark/streaming/examples/JavaNetworkWordCount.java +++ b/examples/src/main/java/org/apache/spark/streaming/examples/JavaNetworkWordCount.java @@ -84,5 +84,6 @@ public Integer call(Integer i1, Integer i2) { wordCounts.print(); ssc.start(); + ssc.awaitTermination(); } } diff --git a/examples/src/main/java/org/apache/spark/streaming/examples/JavaQueueStream.java b/examples/src/main/java/org/apache/spark/streaming/examples/JavaQueueStream.java index 7ef9c6c8f4aaf..26c44620abec1 100644 --- a/examples/src/main/java/org/apache/spark/streaming/examples/JavaQueueStream.java +++ b/examples/src/main/java/org/apache/spark/streaming/examples/JavaQueueStream.java @@ -58,10 +58,9 @@ public static void main(String[] args) throws Exception { } for (int i = 0; i < 30; i++) { - rddQueue.add(ssc.sc().parallelize(list)); + rddQueue.add(ssc.sparkContext().parallelize(list)); } - // Create the QueueInputDStream and use it do some processing JavaDStream inputStream = ssc.queueStream(rddQueue); JavaPairDStream mappedStream = inputStream.map( @@ -81,5 +80,6 @@ public Integer call(Integer i1, Integer i2) { reducedStream.print(); ssc.start(); + ssc.awaitTermination(); } } diff --git a/examples/src/main/scala/org/apache/spark/streaming/examples/ActorWordCount.scala b/examples/src/main/scala/org/apache/spark/streaming/examples/ActorWordCount.scala index 57e1b1f806e82..a5888811cc5ea 100644 --- a/examples/src/main/scala/org/apache/spark/streaming/examples/ActorWordCount.scala +++ b/examples/src/main/scala/org/apache/spark/streaming/examples/ActorWordCount.scala @@ -88,7 +88,7 @@ extends Actor with Receiver { override def 
preStart = remotePublisher ! SubscribeReceiver(context.self) def receive = { - case msg ā‡’ context.parent ! pushBlock(msg.asInstanceOf[T]) + case msg ā‡’ pushBlock(msg.asInstanceOf[T]) } override def postStop() = remotePublisher ! UnsubscribeReceiver(context.self) @@ -171,5 +171,6 @@ object ActorWordCount { lines.flatMap(_.split("\\s+")).map(x => (x, 1)).reduceByKey(_ + _).print() ssc.start() + ssc.awaitTermination() } } diff --git a/examples/src/main/scala/org/apache/spark/streaming/examples/FlumeEventCount.scala b/examples/src/main/scala/org/apache/spark/streaming/examples/FlumeEventCount.scala index a59be7899dd37..11c3aaad3c8a8 100644 --- a/examples/src/main/scala/org/apache/spark/streaming/examples/FlumeEventCount.scala +++ b/examples/src/main/scala/org/apache/spark/streaming/examples/FlumeEventCount.scala @@ -60,5 +60,6 @@ object FlumeEventCount { stream.count().map(cnt => "Received " + cnt + " flume events." ).print() ssc.start() + ssc.awaitTermination() } } diff --git a/examples/src/main/scala/org/apache/spark/streaming/examples/HdfsWordCount.scala b/examples/src/main/scala/org/apache/spark/streaming/examples/HdfsWordCount.scala index 704b315ef8b22..954bcc9b6ef5d 100644 --- a/examples/src/main/scala/org/apache/spark/streaming/examples/HdfsWordCount.scala +++ b/examples/src/main/scala/org/apache/spark/streaming/examples/HdfsWordCount.scala @@ -50,6 +50,7 @@ object HdfsWordCount { val wordCounts = words.map(x => (x, 1)).reduceByKey(_ + _) wordCounts.print() ssc.start() + ssc.awaitTermination() } } diff --git a/examples/src/main/scala/org/apache/spark/streaming/examples/KafkaWordCount.scala b/examples/src/main/scala/org/apache/spark/streaming/examples/KafkaWordCount.scala index 4a3d81c09a122..d9cb7326bb97d 100644 --- a/examples/src/main/scala/org/apache/spark/streaming/examples/KafkaWordCount.scala +++ b/examples/src/main/scala/org/apache/spark/streaming/examples/KafkaWordCount.scala @@ -61,6 +61,7 @@ object KafkaWordCount { wordCounts.print() ssc.start() + ssc.awaitTermination() } } diff --git a/examples/src/main/scala/org/apache/spark/streaming/examples/MQTTWordCount.scala b/examples/src/main/scala/org/apache/spark/streaming/examples/MQTTWordCount.scala index 78b49fdcf1eb3..eb61caf8c85b9 100644 --- a/examples/src/main/scala/org/apache/spark/streaming/examples/MQTTWordCount.scala +++ b/examples/src/main/scala/org/apache/spark/streaming/examples/MQTTWordCount.scala @@ -101,5 +101,6 @@ object MQTTWordCount { val wordCounts = words.map(x => (x, 1)).reduceByKey(_ + _) wordCounts.print() ssc.start() + ssc.awaitTermination() } } diff --git a/examples/src/main/scala/org/apache/spark/streaming/examples/NetworkWordCount.scala b/examples/src/main/scala/org/apache/spark/streaming/examples/NetworkWordCount.scala index 02264757123db..5656d487a57cc 100644 --- a/examples/src/main/scala/org/apache/spark/streaming/examples/NetworkWordCount.scala +++ b/examples/src/main/scala/org/apache/spark/streaming/examples/NetworkWordCount.scala @@ -54,5 +54,6 @@ object NetworkWordCount { val wordCounts = words.map(x => (x, 1)).reduceByKey(_ + _) wordCounts.print() ssc.start() + ssc.awaitTermination() } } diff --git a/examples/src/main/scala/org/apache/spark/streaming/examples/RawNetworkGrep.scala b/examples/src/main/scala/org/apache/spark/streaming/examples/RawNetworkGrep.scala index 99b79c3949a4e..cdd7547d0d3b4 100644 --- a/examples/src/main/scala/org/apache/spark/streaming/examples/RawNetworkGrep.scala +++ b/examples/src/main/scala/org/apache/spark/streaming/examples/RawNetworkGrep.scala @@ -61,5 +61,6 @@ 
object RawNetworkGrep { union.filter(_.contains("the")).count().foreachRDD(r => println("Grep count: " + r.collect().mkString)) ssc.start() + ssc.awaitTermination() } } diff --git a/examples/src/main/scala/org/apache/spark/streaming/examples/RecoverableNetworkWordCount.scala b/examples/src/main/scala/org/apache/spark/streaming/examples/RecoverableNetworkWordCount.scala index 8c5d0bd56845b..aa82bf3c6bd8e 100644 --- a/examples/src/main/scala/org/apache/spark/streaming/examples/RecoverableNetworkWordCount.scala +++ b/examples/src/main/scala/org/apache/spark/streaming/examples/RecoverableNetworkWordCount.scala @@ -114,5 +114,6 @@ object RecoverableNetworkWordCount { createContext(master, ip, port, outputPath) }) ssc.start() + ssc.awaitTermination() } } diff --git a/examples/src/main/scala/org/apache/spark/streaming/examples/StatefulNetworkWordCount.scala b/examples/src/main/scala/org/apache/spark/streaming/examples/StatefulNetworkWordCount.scala index 1183eba84686b..88f1cef89b318 100644 --- a/examples/src/main/scala/org/apache/spark/streaming/examples/StatefulNetworkWordCount.scala +++ b/examples/src/main/scala/org/apache/spark/streaming/examples/StatefulNetworkWordCount.scala @@ -65,5 +65,6 @@ object StatefulNetworkWordCount { val stateDstream = wordDstream.updateStateByKey[Int](updateFunc) stateDstream.print() ssc.start() + ssc.awaitTermination() } } diff --git a/examples/src/main/scala/org/apache/spark/streaming/examples/StreamingExamples.scala b/examples/src/main/scala/org/apache/spark/streaming/examples/StreamingExamples.scala index d41d84a980dc7..99f1502046f53 100644 --- a/examples/src/main/scala/org/apache/spark/streaming/examples/StreamingExamples.scala +++ b/examples/src/main/scala/org/apache/spark/streaming/examples/StreamingExamples.scala @@ -1,3 +1,20 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + package org.apache.spark.streaming.examples import org.apache.spark.Logging diff --git a/examples/src/main/scala/org/apache/spark/streaming/examples/TwitterAlgebirdCMS.scala b/examples/src/main/scala/org/apache/spark/streaming/examples/TwitterAlgebirdCMS.scala index 483c4d311810f..bbd44948b6fa5 100644 --- a/examples/src/main/scala/org/apache/spark/streaming/examples/TwitterAlgebirdCMS.scala +++ b/examples/src/main/scala/org/apache/spark/streaming/examples/TwitterAlgebirdCMS.scala @@ -110,5 +110,6 @@ object TwitterAlgebirdCMS { }) ssc.start() + ssc.awaitTermination() } } diff --git a/examples/src/main/scala/org/apache/spark/streaming/examples/TwitterAlgebirdHLL.scala b/examples/src/main/scala/org/apache/spark/streaming/examples/TwitterAlgebirdHLL.scala index 94c2bf29ac433..a0094d460feec 100644 --- a/examples/src/main/scala/org/apache/spark/streaming/examples/TwitterAlgebirdHLL.scala +++ b/examples/src/main/scala/org/apache/spark/streaming/examples/TwitterAlgebirdHLL.scala @@ -87,5 +87,6 @@ object TwitterAlgebirdHLL { }) ssc.start() + ssc.awaitTermination() } } diff --git a/examples/src/main/scala/org/apache/spark/streaming/examples/TwitterPopularTags.scala b/examples/src/main/scala/org/apache/spark/streaming/examples/TwitterPopularTags.scala index 8a70d4a978cd4..896d010c68f18 100644 --- a/examples/src/main/scala/org/apache/spark/streaming/examples/TwitterPopularTags.scala +++ b/examples/src/main/scala/org/apache/spark/streaming/examples/TwitterPopularTags.scala @@ -69,5 +69,6 @@ object TwitterPopularTags { }) ssc.start() + ssc.awaitTermination() } } diff --git a/examples/src/main/scala/org/apache/spark/streaming/examples/ZeroMQWordCount.scala b/examples/src/main/scala/org/apache/spark/streaming/examples/ZeroMQWordCount.scala index 12d2a1084f900..85b4ce5e81950 100644 --- a/examples/src/main/scala/org/apache/spark/streaming/examples/ZeroMQWordCount.scala +++ b/examples/src/main/scala/org/apache/spark/streaming/examples/ZeroMQWordCount.scala @@ -91,5 +91,6 @@ object ZeroMQWordCount { val wordCounts = words.map(x => (x, 1)).reduceByKey(_ + _) wordCounts.print() ssc.start() + ssc.awaitTermination() } } diff --git a/external/flume/pom.xml b/external/flume/pom.xml index 443910a03a94e..978b99f4a7054 100644 --- a/external/flume/pom.xml +++ b/external/flume/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent - 0.9.0-incubating-SNAPSHOT + 0.9.1-incubating-SNAPSHOT ../../pom.xml diff --git a/external/flume/src/test/resources/log4j.properties b/external/flume/src/test/resources/log4j.properties index 063529a9cbc67..d1bd73a8430e1 100644 --- a/external/flume/src/test/resources/log4j.properties +++ b/external/flume/src/test/resources/log4j.properties @@ -20,7 +20,7 @@ log4j.rootCategory=INFO, file # log4j.appender.file=org.apache.log4j.FileAppender log4j.appender.file=org.apache.log4j.FileAppender log4j.appender.file.append=false -log4j.appender.file.file=streaming/target/unit-tests.log +log4j.appender.file.file=external/flume/target/unit-tests.log log4j.appender.file.layout=org.apache.log4j.PatternLayout log4j.appender.file.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss.SSS} %p %c{1}: %m%n diff --git a/external/kafka/pom.xml b/external/kafka/pom.xml index 23b2fead657e6..a3d5fc64f070e 100644 --- a/external/kafka/pom.xml +++ b/external/kafka/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent - 0.9.0-incubating-SNAPSHOT + 0.9.1-incubating-SNAPSHOT ../../pom.xml diff --git a/external/kafka/src/test/resources/log4j.properties b/external/kafka/src/test/resources/log4j.properties index 
063529a9cbc67..38910d113050a 100644 --- a/external/kafka/src/test/resources/log4j.properties +++ b/external/kafka/src/test/resources/log4j.properties @@ -20,7 +20,7 @@ log4j.rootCategory=INFO, file # log4j.appender.file=org.apache.log4j.FileAppender log4j.appender.file=org.apache.log4j.FileAppender log4j.appender.file.append=false -log4j.appender.file.file=streaming/target/unit-tests.log +log4j.appender.file.file=external/kafka/target/unit-tests.log log4j.appender.file.layout=org.apache.log4j.PatternLayout log4j.appender.file.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss.SSS} %p %c{1}: %m%n diff --git a/external/kafka/src/test/scala/org/apache/spark/streaming/kafka/KafkaStreamSuite.scala b/external/kafka/src/test/scala/org/apache/spark/streaming/kafka/KafkaStreamSuite.scala index 9c81f23c19118..d9809f6409d44 100644 --- a/external/kafka/src/test/scala/org/apache/spark/streaming/kafka/KafkaStreamSuite.scala +++ b/external/kafka/src/test/scala/org/apache/spark/streaming/kafka/KafkaStreamSuite.scala @@ -35,5 +35,6 @@ class KafkaStreamSuite extends TestSuiteBase { ssc, kafkaParams, topics, StorageLevel.MEMORY_AND_DISK_SER_2) // TODO: Actually test receiving data + ssc.stop() } } diff --git a/external/mqtt/pom.xml b/external/mqtt/pom.xml index 31b4fa87de772..1f416dd8c06d4 100644 --- a/external/mqtt/pom.xml +++ b/external/mqtt/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent - 0.9.0-incubating-SNAPSHOT + 0.9.1-incubating-SNAPSHOT ../../pom.xml diff --git a/external/mqtt/src/test/resources/log4j.properties b/external/mqtt/src/test/resources/log4j.properties index 063529a9cbc67..d0462c7336df5 100644 --- a/external/mqtt/src/test/resources/log4j.properties +++ b/external/mqtt/src/test/resources/log4j.properties @@ -20,7 +20,7 @@ log4j.rootCategory=INFO, file # log4j.appender.file=org.apache.log4j.FileAppender log4j.appender.file=org.apache.log4j.FileAppender log4j.appender.file.append=false -log4j.appender.file.file=streaming/target/unit-tests.log +log4j.appender.file.file=external/mqtt/target/unit-tests.log log4j.appender.file.layout=org.apache.log4j.PatternLayout log4j.appender.file.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss.SSS} %p %c{1}: %m%n diff --git a/external/mqtt/src/test/scala/org/apache/spark/streaming/mqtt/MQTTStreamSuite.scala b/external/mqtt/src/test/scala/org/apache/spark/streaming/mqtt/MQTTStreamSuite.scala index 73e7ce6e968c6..89c40ad4619c9 100644 --- a/external/mqtt/src/test/scala/org/apache/spark/streaming/mqtt/MQTTStreamSuite.scala +++ b/external/mqtt/src/test/scala/org/apache/spark/streaming/mqtt/MQTTStreamSuite.scala @@ -32,5 +32,6 @@ class MQTTStreamSuite extends TestSuiteBase { val test2 = MQTTUtils.createStream(ssc, brokerUrl, topic, StorageLevel.MEMORY_AND_DISK_SER_2) // TODO: Actually test receiving data + ssc.stop() } } diff --git a/external/twitter/pom.xml b/external/twitter/pom.xml index 216e6c1d8ff44..f23091684f95c 100644 --- a/external/twitter/pom.xml +++ b/external/twitter/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent - 0.9.0-incubating-SNAPSHOT + 0.9.1-incubating-SNAPSHOT ../../pom.xml diff --git a/external/twitter/src/test/resources/log4j.properties b/external/twitter/src/test/resources/log4j.properties index 063529a9cbc67..c918335fcdc70 100644 --- a/external/twitter/src/test/resources/log4j.properties +++ b/external/twitter/src/test/resources/log4j.properties @@ -20,7 +20,7 @@ log4j.rootCategory=INFO, file # log4j.appender.file=org.apache.log4j.FileAppender log4j.appender.file=org.apache.log4j.FileAppender log4j.appender.file.append=false 
-log4j.appender.file.file=streaming/target/unit-tests.log +log4j.appender.file.file=external/twitter/target/unit-tests.log log4j.appender.file.layout=org.apache.log4j.PatternLayout log4j.appender.file.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss.SSS} %p %c{1}: %m%n diff --git a/external/twitter/src/test/scala/org/apache/spark/streaming/twitter/TwitterStreamSuite.scala b/external/twitter/src/test/scala/org/apache/spark/streaming/twitter/TwitterStreamSuite.scala index ccc38784ef671..06ab0cdaf3b4e 100644 --- a/external/twitter/src/test/scala/org/apache/spark/streaming/twitter/TwitterStreamSuite.scala +++ b/external/twitter/src/test/scala/org/apache/spark/streaming/twitter/TwitterStreamSuite.scala @@ -39,5 +39,6 @@ class TwitterStreamSuite extends TestSuiteBase { // Note that actually testing the data receiving is hard as authentication keys are // necessary for accessing Twitter live stream + ssc.stop() } } diff --git a/external/zeromq/pom.xml b/external/zeromq/pom.xml index c240d595742cf..6a250b3916ead 100644 --- a/external/zeromq/pom.xml +++ b/external/zeromq/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent - 0.9.0-incubating-SNAPSHOT + 0.9.1-incubating-SNAPSHOT ../../pom.xml diff --git a/external/zeromq/src/test/resources/log4j.properties b/external/zeromq/src/test/resources/log4j.properties index 063529a9cbc67..304683dd0bac3 100644 --- a/external/zeromq/src/test/resources/log4j.properties +++ b/external/zeromq/src/test/resources/log4j.properties @@ -20,7 +20,7 @@ log4j.rootCategory=INFO, file # log4j.appender.file=org.apache.log4j.FileAppender log4j.appender.file=org.apache.log4j.FileAppender log4j.appender.file.append=false -log4j.appender.file.file=streaming/target/unit-tests.log +log4j.appender.file.file=external/zeromq/target/unit-tests.log log4j.appender.file.layout=org.apache.log4j.PatternLayout log4j.appender.file.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss.SSS} %p %c{1}: %m%n diff --git a/external/zeromq/src/test/scala/org/apache/spark/streaming/zeromq/ZeroMQStreamSuite.scala b/external/zeromq/src/test/scala/org/apache/spark/streaming/zeromq/ZeroMQStreamSuite.scala index 4193b8a02f14a..92d55a7a7b6e4 100644 --- a/external/zeromq/src/test/scala/org/apache/spark/streaming/zeromq/ZeroMQStreamSuite.scala +++ b/external/zeromq/src/test/scala/org/apache/spark/streaming/zeromq/ZeroMQStreamSuite.scala @@ -40,5 +40,6 @@ class ZeroMQStreamSuite extends TestSuiteBase { StorageLevel.MEMORY_AND_DISK_SER_2, SupervisorStrategy.defaultStrategy) // TODO: Actually test data receiving + ssc.stop() } } diff --git a/extras/spark-ganglia-lgpl/pom.xml b/extras/spark-ganglia-lgpl/pom.xml new file mode 100644 index 0000000000000..e39b07dd2f15d --- /dev/null +++ b/extras/spark-ganglia-lgpl/pom.xml @@ -0,0 +1,45 @@ + + + + 4.0.0 + + org.apache.spark + spark-parent + 0.9.1-incubating-SNAPSHOT + ../../pom.xml + + + + org.apache.spark + spark-ganglia-lgpl_2.10 + jar + Spark Ganglia Integration + + + + org.apache.spark + spark-core_${scala.binary.version} + ${project.version} + + + + com.codahale.metrics + metrics-ganglia + + + diff --git a/core/src/main/scala/org/apache/spark/metrics/sink/GangliaSink.scala b/extras/spark-ganglia-lgpl/src/main/scala/org/apache/spark/metrics/sink/GangliaSink.scala similarity index 100% rename from core/src/main/scala/org/apache/spark/metrics/sink/GangliaSink.scala rename to extras/spark-ganglia-lgpl/src/main/scala/org/apache/spark/metrics/sink/GangliaSink.scala diff --git a/graphx/pom.xml b/graphx/pom.xml index 3e5faf230dbc9..9073cf7aa66ec 100644 --- a/graphx/pom.xml 
+++ b/graphx/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent - 0.9.0-incubating-SNAPSHOT + 0.9.1-incubating-SNAPSHOT ../pom.xml @@ -36,7 +36,11 @@ org.apache.spark spark-core_${scala.binary.version} ${project.version} - provided + + + org.apache.commons + commons-math3 + 3.2 org.eclipse.jetty diff --git a/graphx/src/main/scala/org/apache/spark/graphx/Edge.scala b/graphx/src/main/scala/org/apache/spark/graphx/Edge.scala index 738a38b27f0e4..580faa0866789 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/Edge.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/Edge.scala @@ -1,3 +1,20 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + package org.apache.spark.graphx /** @@ -11,8 +28,8 @@ package org.apache.spark.graphx * @param attr The attribute associated with the edge */ case class Edge[@specialized(Char, Int, Boolean, Byte, Long, Float, Double) ED] ( - var srcId: VertexID = 0, - var dstId: VertexID = 0, + var srcId: VertexId = 0, + var dstId: VertexId = 0, var attr: ED = null.asInstanceOf[ED]) extends Serializable { @@ -22,7 +39,7 @@ case class Edge[@specialized(Char, Int, Boolean, Byte, Long, Float, Double) ED] * @param vid the id one of the two vertices on the edge. * @return the id of the other vertex on the edge. */ - def otherVertexId(vid: VertexID): VertexID = + def otherVertexId(vid: VertexId): VertexId = if (srcId == vid) dstId else { assert(dstId == vid); srcId } /** @@ -33,7 +50,7 @@ case class Edge[@specialized(Char, Int, Boolean, Byte, Long, Float, Double) ED] * @return the relative direction of the edge to the corresponding * vertex. */ - def relativeDirection(vid: VertexID): EdgeDirection = + def relativeDirection(vid: VertexId): EdgeDirection = if (vid == srcId) EdgeDirection.Out else { assert(vid == dstId); EdgeDirection.In } } diff --git a/graphx/src/main/scala/org/apache/spark/graphx/EdgeDirection.scala b/graphx/src/main/scala/org/apache/spark/graphx/EdgeDirection.scala index f265764006234..6f03eb1439773 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/EdgeDirection.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/EdgeDirection.scala @@ -1,3 +1,20 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + package org.apache.spark.graphx /** diff --git a/graphx/src/main/scala/org/apache/spark/graphx/EdgeRDD.scala b/graphx/src/main/scala/org/apache/spark/graphx/EdgeRDD.scala index 832b7816fe833..fe03ae4a629b9 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/EdgeRDD.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/EdgeRDD.scala @@ -1,3 +1,20 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + package org.apache.spark.graphx import scala.reflect.{classTag, ClassTag} @@ -85,7 +102,7 @@ class EdgeRDD[@specialized ED: ClassTag]( */ def innerJoin[ED2: ClassTag, ED3: ClassTag] (other: EdgeRDD[ED2]) - (f: (VertexID, VertexID, ED, ED2) => ED3): EdgeRDD[ED3] = { + (f: (VertexId, VertexId, ED, ED2) => ED3): EdgeRDD[ED3] = { val ed2Tag = classTag[ED2] val ed3Tag = classTag[ED3] new EdgeRDD[ED3](partitionsRDD.zipPartitions(other.partitionsRDD, true) { @@ -96,7 +113,7 @@ class EdgeRDD[@specialized ED: ClassTag]( }) } - private[graphx] def collectVertexIDs(): RDD[VertexID] = { + private[graphx] def collectVertexIds(): RDD[VertexId] = { partitionsRDD.flatMap { case (_, p) => Array.concat(p.srcIds, p.dstIds) } } } diff --git a/graphx/src/main/scala/org/apache/spark/graphx/EdgeTriplet.scala b/graphx/src/main/scala/org/apache/spark/graphx/EdgeTriplet.scala index 4253b24b5ac55..fea43c3b2bbf1 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/EdgeTriplet.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/EdgeTriplet.scala @@ -1,3 +1,20 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + package org.apache.spark.graphx /** @@ -33,7 +50,7 @@ class EdgeTriplet[VD, ED] extends Edge[ED] { * @param vid the id one of the two vertices on the edge * @return the attribute for the other vertex on the edge */ - def otherVertexAttr(vid: VertexID): VD = + def otherVertexAttr(vid: VertexId): VD = if (srcId == vid) dstAttr else { assert(dstId == vid); srcAttr } /** @@ -42,7 +59,7 @@ class EdgeTriplet[VD, ED] extends Edge[ED] { * @param vid the id of one of the two vertices on the edge * @return the attr for the vertex with that id */ - def vertexAttr(vid: VertexID): VD = + def vertexAttr(vid: VertexId): VD = if (srcId == vid) srcAttr else { assert(dstId == vid); dstAttr } override def toString = ((srcId, srcAttr), (dstId, dstAttr), attr).toString() diff --git a/graphx/src/main/scala/org/apache/spark/graphx/Graph.scala b/graphx/src/main/scala/org/apache/spark/graphx/Graph.scala index 9dd05ade0aef2..eea95d38d5016 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/Graph.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/Graph.scala @@ -1,3 +1,20 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + package org.apache.spark.graphx import scala.reflect.ClassTag @@ -109,7 +126,7 @@ abstract class Graph[VD: ClassTag, ED: ClassTag] protected () extends Serializab * }}} * */ - def mapVertices[VD2: ClassTag](map: (VertexID, VD) => VD2): Graph[VD2, ED] + def mapVertices[VD2: ClassTag](map: (VertexId, VD) => VD2): Graph[VD2, ED] /** * Transforms each edge attribute in the graph using the map function. 
The map function is not @@ -225,7 +242,7 @@ abstract class Graph[VD: ClassTag, ED: ClassTag] protected () extends Serializab */ def subgraph( epred: EdgeTriplet[VD,ED] => Boolean = (x => true), - vpred: (VertexID, VD) => Boolean = ((v, d) => true)) + vpred: (VertexId, VD) => Boolean = ((v, d) => true)) : Graph[VD, ED] /** @@ -275,7 +292,7 @@ abstract class Graph[VD: ClassTag, ED: ClassTag] protected () extends Serializab * vertex * {{{ * val rawGraph: Graph[(),()] = Graph.textFile("twittergraph") - * val inDeg: RDD[(VertexID, Int)] = + * val inDeg: RDD[(VertexId, Int)] = * mapReduceTriplets[Int](et => Iterator((et.dst.id, 1)), _ + _) * }}} * @@ -287,7 +304,7 @@ abstract class Graph[VD: ClassTag, ED: ClassTag] protected () extends Serializab * */ def mapReduceTriplets[A: ClassTag]( - mapFunc: EdgeTriplet[VD, ED] => Iterator[(VertexID, A)], + mapFunc: EdgeTriplet[VD, ED] => Iterator[(VertexId, A)], reduceFunc: (A, A) => A, activeSetOpt: Option[(VertexRDD[_], EdgeDirection)] = None) : VertexRDD[A] @@ -311,14 +328,14 @@ abstract class Graph[VD: ClassTag, ED: ClassTag] protected () extends Serializab * * {{{ * val rawGraph: Graph[_, _] = Graph.textFile("webgraph") - * val outDeg: RDD[(VertexID, Int)] = rawGraph.outDegrees() + * val outDeg: RDD[(VertexId, Int)] = rawGraph.outDegrees() * val graph = rawGraph.outerJoinVertices(outDeg) { * (vid, data, optDeg) => optDeg.getOrElse(0) * } * }}} */ - def outerJoinVertices[U: ClassTag, VD2: ClassTag](other: RDD[(VertexID, U)]) - (mapFunc: (VertexID, VD, Option[U]) => VD2) + def outerJoinVertices[U: ClassTag, VD2: ClassTag](other: RDD[(VertexId, U)]) + (mapFunc: (VertexId, VD, Option[U]) => VD2) : Graph[VD2, ED] /** @@ -347,7 +364,7 @@ object Graph { * (if `uniqueEdges` is `None`) and vertex attributes containing the total degree of each vertex. */ def fromEdgeTuples[VD: ClassTag]( - rawEdges: RDD[(VertexID, VertexID)], + rawEdges: RDD[(VertexId, VertexId)], defaultValue: VD, uniqueEdges: Option[PartitionStrategy] = None): Graph[VD, Int] = { @@ -388,7 +405,7 @@ object Graph { * mentioned in edges but not in vertices */ def apply[VD: ClassTag, ED: ClassTag]( - vertices: RDD[(VertexID, VD)], + vertices: RDD[(VertexId, VD)], edges: RDD[Edge[ED]], defaultVertexAttr: VD = null.asInstanceOf[VD]): Graph[VD, ED] = { GraphImpl(vertices, edges, defaultVertexAttr) diff --git a/graphx/src/main/scala/org/apache/spark/graphx/GraphKryoRegistrator.scala b/graphx/src/main/scala/org/apache/spark/graphx/GraphKryoRegistrator.scala index d79bdf961841b..dd380d8c182c9 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/GraphKryoRegistrator.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/GraphKryoRegistrator.scala @@ -1,3 +1,20 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + package org.apache.spark.graphx import com.esotericsoftware.kryo.Kryo @@ -16,7 +33,7 @@ class GraphKryoRegistrator extends KryoRegistrator { kryo.register(classOf[Edge[Object]]) kryo.register(classOf[MessageToPartition[Object]]) kryo.register(classOf[VertexBroadcastMsg[Object]]) - kryo.register(classOf[(VertexID, Object)]) + kryo.register(classOf[(VertexId, Object)]) kryo.register(classOf[EdgePartition[Object]]) kryo.register(classOf[BitSet]) kryo.register(classOf[VertexIdToIndexMap]) diff --git a/graphx/src/main/scala/org/apache/spark/graphx/GraphLoader.scala b/graphx/src/main/scala/org/apache/spark/graphx/GraphLoader.scala index 5904aa3a28c71..18858466db27b 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/GraphLoader.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/GraphLoader.scala @@ -1,3 +1,20 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + package org.apache.spark.graphx import org.apache.spark.{Logging, SparkContext} diff --git a/graphx/src/main/scala/org/apache/spark/graphx/GraphOps.scala b/graphx/src/main/scala/org/apache/spark/graphx/GraphOps.scala index f10e63f059aed..0fc1e4df6813c 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/GraphOps.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/GraphOps.scala @@ -1,3 +1,20 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + package org.apache.spark.graphx import scala.reflect.ClassTag @@ -63,19 +80,19 @@ class GraphOps[VD: ClassTag, ED: ClassTag](graph: Graph[VD, ED]) extends Seriali * * @return the set of neighboring ids for each vertex */ - def collectNeighborIds(edgeDirection: EdgeDirection): VertexRDD[Array[VertexID]] = { + def collectNeighborIds(edgeDirection: EdgeDirection): VertexRDD[Array[VertexId]] = { val nbrs = if (edgeDirection == EdgeDirection.Either) { - graph.mapReduceTriplets[Array[VertexID]]( + graph.mapReduceTriplets[Array[VertexId]]( mapFunc = et => Iterator((et.srcId, Array(et.dstId)), (et.dstId, Array(et.srcId))), reduceFunc = _ ++ _ ) } else if (edgeDirection == EdgeDirection.Out) { - graph.mapReduceTriplets[Array[VertexID]]( + graph.mapReduceTriplets[Array[VertexId]]( mapFunc = et => Iterator((et.srcId, Array(et.dstId))), reduceFunc = _ ++ _) } else if (edgeDirection == EdgeDirection.In) { - graph.mapReduceTriplets[Array[VertexID]]( + graph.mapReduceTriplets[Array[VertexId]]( mapFunc = et => Iterator((et.dstId, Array(et.srcId))), reduceFunc = _ ++ _) } else { @@ -83,7 +100,7 @@ class GraphOps[VD: ClassTag, ED: ClassTag](graph: Graph[VD, ED]) extends Seriali "direction. (EdgeDirection.Both is not supported; use EdgeDirection.Either instead.)") } graph.vertices.leftZipJoin(nbrs) { (vid, vdata, nbrsOpt) => - nbrsOpt.getOrElse(Array.empty[VertexID]) + nbrsOpt.getOrElse(Array.empty[VertexId]) } } // end of collectNeighborIds @@ -99,8 +116,8 @@ class GraphOps[VD: ClassTag, ED: ClassTag](graph: Graph[VD, ED]) extends Seriali * * @return the vertex set of neighboring vertex attributes for each vertex */ - def collectNeighbors(edgeDirection: EdgeDirection): VertexRDD[Array[(VertexID, VD)]] = { - val nbrs = graph.mapReduceTriplets[Array[(VertexID,VD)]]( + def collectNeighbors(edgeDirection: EdgeDirection): VertexRDD[Array[(VertexId, VD)]] = { + val nbrs = graph.mapReduceTriplets[Array[(VertexId,VD)]]( edge => { val msgToSrc = (edge.srcId, Array((edge.dstId, edge.dstAttr))) val msgToDst = (edge.dstId, Array((edge.srcId, edge.srcAttr))) @@ -116,7 +133,7 @@ class GraphOps[VD: ClassTag, ED: ClassTag](graph: Graph[VD, ED]) extends Seriali (a, b) => a ++ b) graph.vertices.leftZipJoin(nbrs) { (vid, vdata, nbrsOpt) => - nbrsOpt.getOrElse(Array.empty[(VertexID, VD)]) + nbrsOpt.getOrElse(Array.empty[(VertexId, VD)]) } } // end of collectNeighbor @@ -147,9 +164,9 @@ class GraphOps[VD: ClassTag, ED: ClassTag](graph: Graph[VD, ED]) extends Seriali * }}} * */ - def joinVertices[U: ClassTag](table: RDD[(VertexID, U)])(mapFunc: (VertexID, VD, U) => VD) + def joinVertices[U: ClassTag](table: RDD[(VertexId, U)])(mapFunc: (VertexId, VD, U) => VD) : Graph[VD, ED] = { - val uf = (id: VertexID, data: VD, o: Option[U]) => { + val uf = (id: VertexId, data: VD, o: Option[U]) => { o match { case Some(u) => mapFunc(id, data, u) case None => data @@ -180,7 +197,7 @@ class GraphOps[VD: ClassTag, ED: ClassTag](graph: Graph[VD, ED]) extends Seriali * val degrees: VertexRDD[Int] = graph.outDegrees * graph.outerJoinVertices(degrees) {(vid, data, deg) => deg.getOrElse(0)} * }, - * vpred = (vid: VertexID, deg:Int) => deg > 0 + * vpred = (vid: VertexId, deg:Int) => deg > 0 * ) * }}} * @@ -188,7 +205,7 @@ class GraphOps[VD: ClassTag, ED: ClassTag](graph: Graph[VD, ED]) extends Seriali def filter[VD2: ClassTag, ED2: ClassTag]( preprocess: Graph[VD, ED] => Graph[VD2, ED2], epred: (EdgeTriplet[VD2, ED2]) => Boolean = (x: EdgeTriplet[VD2, ED2]) => true, - vpred: (VertexID, VD2) => Boolean = (v:VertexID, d:VD2) => true): Graph[VD, 
ED] = { + vpred: (VertexId, VD2) => Boolean = (v:VertexId, d:VD2) => true): Graph[VD, ED] = { graph.mask(preprocess(graph).subgraph(epred, vpred)) } @@ -243,8 +260,8 @@ class GraphOps[VD: ClassTag, ED: ClassTag](graph: Graph[VD, ED]) extends Seriali initialMsg: A, maxIterations: Int = Int.MaxValue, activeDirection: EdgeDirection = EdgeDirection.Either)( - vprog: (VertexID, VD, A) => VD, - sendMsg: EdgeTriplet[VD, ED] => Iterator[(VertexID,A)], + vprog: (VertexId, VD, A) => VD, + sendMsg: EdgeTriplet[VD, ED] => Iterator[(VertexId,A)], mergeMsg: (A, A) => A) : Graph[VD, ED] = { Pregel(graph, initialMsg, maxIterations, activeDirection)(vprog, sendMsg, mergeMsg) @@ -276,7 +293,7 @@ class GraphOps[VD: ClassTag, ED: ClassTag](graph: Graph[VD, ED]) extends Seriali * * @see [[org.apache.spark.graphx.lib.ConnectedComponents$#run]] */ - def connectedComponents(): Graph[VertexID, ED] = { + def connectedComponents(): Graph[VertexId, ED] = { ConnectedComponents.run(graph) } @@ -295,7 +312,7 @@ class GraphOps[VD: ClassTag, ED: ClassTag](graph: Graph[VD, ED]) extends Seriali * * @see [[org.apache.spark.graphx.lib.StronglyConnectedComponents$#run]] */ - def stronglyConnectedComponents(numIter: Int): Graph[VertexID, ED] = { + def stronglyConnectedComponents(numIter: Int): Graph[VertexId, ED] = { StronglyConnectedComponents.run(graph, numIter) } } // end of GraphOps diff --git a/graphx/src/main/scala/org/apache/spark/graphx/PartitionStrategy.scala b/graphx/src/main/scala/org/apache/spark/graphx/PartitionStrategy.scala index 6d2990a3f6642..929915362c1c9 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/PartitionStrategy.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/PartitionStrategy.scala @@ -1,3 +1,20 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + package org.apache.spark.graphx /** @@ -6,7 +23,7 @@ package org.apache.spark.graphx */ trait PartitionStrategy extends Serializable { /** Returns the partition number for a given edge. */ - def getPartition(src: VertexID, dst: VertexID, numParts: PartitionID): PartitionID + def getPartition(src: VertexId, dst: VertexId, numParts: PartitionID): PartitionID } /** @@ -56,9 +73,9 @@ object PartitionStrategy { * is used. 
*/ case object EdgePartition2D extends PartitionStrategy { - override def getPartition(src: VertexID, dst: VertexID, numParts: PartitionID): PartitionID = { + override def getPartition(src: VertexId, dst: VertexId, numParts: PartitionID): PartitionID = { val ceilSqrtNumParts: PartitionID = math.ceil(math.sqrt(numParts)).toInt - val mixingPrime: VertexID = 1125899906842597L + val mixingPrime: VertexId = 1125899906842597L val col: PartitionID = ((math.abs(src) * mixingPrime) % ceilSqrtNumParts).toInt val row: PartitionID = ((math.abs(dst) * mixingPrime) % ceilSqrtNumParts).toInt (col * ceilSqrtNumParts + row) % numParts @@ -70,8 +87,8 @@ object PartitionStrategy { * source. */ case object EdgePartition1D extends PartitionStrategy { - override def getPartition(src: VertexID, dst: VertexID, numParts: PartitionID): PartitionID = { - val mixingPrime: VertexID = 1125899906842597L + override def getPartition(src: VertexId, dst: VertexId, numParts: PartitionID): PartitionID = { + val mixingPrime: VertexId = 1125899906842597L (math.abs(src) * mixingPrime).toInt % numParts } } @@ -82,7 +99,7 @@ object PartitionStrategy { * random vertex cut that colocates all same-direction edges between two vertices. */ case object RandomVertexCut extends PartitionStrategy { - override def getPartition(src: VertexID, dst: VertexID, numParts: PartitionID): PartitionID = { + override def getPartition(src: VertexId, dst: VertexId, numParts: PartitionID): PartitionID = { math.abs((src, dst).hashCode()) % numParts } } @@ -94,7 +111,7 @@ object PartitionStrategy { * regardless of direction. */ case object CanonicalRandomVertexCut extends PartitionStrategy { - override def getPartition(src: VertexID, dst: VertexID, numParts: PartitionID): PartitionID = { + override def getPartition(src: VertexId, dst: VertexId, numParts: PartitionID): PartitionID = { val lower = math.min(src, dst) val higher = math.max(src, dst) math.abs((lower, higher).hashCode()) % numParts diff --git a/graphx/src/main/scala/org/apache/spark/graphx/Pregel.scala b/graphx/src/main/scala/org/apache/spark/graphx/Pregel.scala index fc18f7e785a99..ac07a594a12e4 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/Pregel.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/Pregel.scala @@ -1,3 +1,20 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + package org.apache.spark.graphx import scala.reflect.ClassTag @@ -23,9 +40,9 @@ import scala.reflect.ClassTag * // Set the vertex attributes to the initial pagerank values * .mapVertices((id, attr) => 1.0) * - * def vertexProgram(id: VertexID, attr: Double, msgSum: Double): Double = + * def vertexProgram(id: VertexId, attr: Double, msgSum: Double): Double = * resetProb + (1.0 - resetProb) * msgSum - * def sendMessage(id: VertexID, edge: EdgeTriplet[Double, Double]): Iterator[(VertexId, Double)] = + * def sendMessage(id: VertexId, edge: EdgeTriplet[Double, Double]): Iterator[(VertexId, Double)] = * Iterator((edge.dstId, edge.srcAttr * edge.attr)) * def messageCombiner(a: Double, b: Double): Double = a + b * val initialMessage = 0.0 @@ -96,8 +113,8 @@ object Pregel { initialMsg: A, maxIterations: Int = Int.MaxValue, activeDirection: EdgeDirection = EdgeDirection.Either) - (vprog: (VertexID, VD, A) => VD, - sendMsg: EdgeTriplet[VD, ED] => Iterator[(VertexID, A)], + (vprog: (VertexId, VD, A) => VD, + sendMsg: EdgeTriplet[VD, ED] => Iterator[(VertexId, A)], mergeMsg: (A, A) => A) : Graph[VD, ED] = { diff --git a/graphx/src/main/scala/org/apache/spark/graphx/VertexRDD.scala b/graphx/src/main/scala/org/apache/spark/graphx/VertexRDD.scala index 9a95364cb16dd..edd59bcf32943 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/VertexRDD.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/VertexRDD.scala @@ -28,7 +28,7 @@ import org.apache.spark.graphx.impl.MsgRDDFunctions import org.apache.spark.graphx.impl.VertexPartition /** - * Extends `RDD[(VertexID, VD)]` by ensuring that there is only one entry for each vertex and by + * Extends `RDD[(VertexId, VD)]` by ensuring that there is only one entry for each vertex and by * pre-indexing the entries for fast, efficient joins. Two VertexRDDs with the same index can be * joined efficiently. All operations except [[reindex]] preserve the index. To construct a * `VertexRDD`, use the [[org.apache.spark.graphx.VertexRDD$ VertexRDD object]]. @@ -36,12 +36,12 @@ import org.apache.spark.graphx.impl.VertexPartition * @example Construct a `VertexRDD` from a plain RDD: * {{{ * // Construct an initial vertex set - * val someData: RDD[(VertexID, SomeType)] = loadData(someFile) + * val someData: RDD[(VertexId, SomeType)] = loadData(someFile) * val vset = VertexRDD(someData) * // If there were redundant values in someData we would use a reduceFunc * val vset2 = VertexRDD(someData, reduceFunc) * // Finally we can use the VertexRDD to index another dataset - * val otherData: RDD[(VertexID, OtherType)] = loadData(otherFile) + * val otherData: RDD[(VertexId, OtherType)] = loadData(otherFile) * val vset3 = vset2.innerJoin(otherData) { (vid, a, b) => b } * // Now we can construct very fast joins between the two sets * val vset4: VertexRDD[(SomeType, OtherType)] = vset.leftJoin(vset3) @@ -51,7 +51,7 @@ import org.apache.spark.graphx.impl.VertexPartition */ class VertexRDD[@specialized VD: ClassTag]( val partitionsRDD: RDD[VertexPartition[VD]]) - extends RDD[(VertexID, VD)](partitionsRDD.context, List(new OneToOneDependency(partitionsRDD))) { + extends RDD[(VertexId, VD)](partitionsRDD.context, List(new OneToOneDependency(partitionsRDD))) { require(partitionsRDD.partitioner.isDefined) @@ -92,9 +92,9 @@ class VertexRDD[@specialized VD: ClassTag]( } /** - * Provides the `RDD[(VertexID, VD)]` equivalent output. + * Provides the `RDD[(VertexId, VD)]` equivalent output. 
*/ - override def compute(part: Partition, context: TaskContext): Iterator[(VertexID, VD)] = { + override def compute(part: Partition, context: TaskContext): Iterator[(VertexId, VD)] = { firstParent[VertexPartition[VD]].iterator(part, context).next.iterator } @@ -114,9 +114,9 @@ class VertexRDD[@specialized VD: ClassTag]( * rather than allocating new memory. * * @param pred the user defined predicate, which takes a tuple to conform to the - * `RDD[(VertexID, VD)]` interface + * `RDD[(VertexId, VD)]` interface */ - override def filter(pred: Tuple2[VertexID, VD] => Boolean): VertexRDD[VD] = + override def filter(pred: Tuple2[VertexId, VD] => Boolean): VertexRDD[VD] = this.mapVertexPartitions(_.filter(Function.untupled(pred))) /** @@ -140,7 +140,7 @@ class VertexRDD[@specialized VD: ClassTag]( * @return a new VertexRDD with values obtained by applying `f` to each of the entries in the * original VertexRDD. The resulting VertexRDD retains the same index. */ - def mapValues[VD2: ClassTag](f: (VertexID, VD) => VD2): VertexRDD[VD2] = + def mapValues[VD2: ClassTag](f: (VertexId, VD) => VD2): VertexRDD[VD2] = this.mapVertexPartitions(_.map(f)) /** @@ -172,7 +172,7 @@ class VertexRDD[@specialized VD: ClassTag]( * @return a VertexRDD containing the results of `f` */ def leftZipJoin[VD2: ClassTag, VD3: ClassTag] - (other: VertexRDD[VD2])(f: (VertexID, VD, Option[VD2]) => VD3): VertexRDD[VD3] = { + (other: VertexRDD[VD2])(f: (VertexId, VD, Option[VD2]) => VD3): VertexRDD[VD3] = { val newPartitionsRDD = partitionsRDD.zipPartitions( other.partitionsRDD, preservesPartitioning = true ) { (thisIter, otherIter) => @@ -200,8 +200,8 @@ class VertexRDD[@specialized VD: ClassTag]( * by `f`. */ def leftJoin[VD2: ClassTag, VD3: ClassTag] - (other: RDD[(VertexID, VD2)]) - (f: (VertexID, VD, Option[VD2]) => VD3) + (other: RDD[(VertexId, VD2)]) + (f: (VertexId, VD, Option[VD2]) => VD3) : VertexRDD[VD3] = { // Test if the other vertex is a VertexRDD to choose the optimal join strategy. // If the other set is a VertexRDD then we use the much more efficient leftZipJoin @@ -225,7 +225,7 @@ class VertexRDD[@specialized VD: ClassTag]( * [[innerJoin]] for the behavior of the join. */ def innerZipJoin[U: ClassTag, VD2: ClassTag](other: VertexRDD[U]) - (f: (VertexID, VD, U) => VD2): VertexRDD[VD2] = { + (f: (VertexId, VD, U) => VD2): VertexRDD[VD2] = { val newPartitionsRDD = partitionsRDD.zipPartitions( other.partitionsRDD, preservesPartitioning = true ) { (thisIter, otherIter) => @@ -247,8 +247,8 @@ class VertexRDD[@specialized VD: ClassTag]( * @return a VertexRDD co-indexed with `this`, containing only vertices that appear in both `this` * and `other`, with values supplied by `f` */ - def innerJoin[U: ClassTag, VD2: ClassTag](other: RDD[(VertexID, U)]) - (f: (VertexID, VD, U) => VD2): VertexRDD[VD2] = { + def innerJoin[U: ClassTag, VD2: ClassTag](other: RDD[(VertexId, U)]) + (f: (VertexId, VD, U) => VD2): VertexRDD[VD2] = { // Test if the other vertex is a VertexRDD to choose the optimal join strategy. // If the other set is a VertexRDD then we use the much more efficient innerZipJoin other match { @@ -278,7 +278,7 @@ class VertexRDD[@specialized VD: ClassTag]( * messages. 
*/ def aggregateUsingIndex[VD2: ClassTag]( - messages: RDD[(VertexID, VD2)], reduceFunc: (VD2, VD2) => VD2): VertexRDD[VD2] = { + messages: RDD[(VertexId, VD2)], reduceFunc: (VD2, VD2) => VD2): VertexRDD[VD2] = { val shuffled = MsgRDDFunctions.partitionForAggregation(messages, this.partitioner.get) val parts = partitionsRDD.zipPartitions(shuffled, true) { (thisIter, msgIter) => val vertexPartition: VertexPartition[VD] = thisIter.next() @@ -303,8 +303,8 @@ object VertexRDD { * * @param rdd the collection of vertex-attribute pairs */ - def apply[VD: ClassTag](rdd: RDD[(VertexID, VD)]): VertexRDD[VD] = { - val partitioned: RDD[(VertexID, VD)] = rdd.partitioner match { + def apply[VD: ClassTag](rdd: RDD[(VertexId, VD)]): VertexRDD[VD] = { + val partitioned: RDD[(VertexId, VD)] = rdd.partitioner match { case Some(p) => rdd case None => rdd.partitionBy(new HashPartitioner(rdd.partitions.size)) } @@ -323,8 +323,8 @@ object VertexRDD { * @param rdd the collection of vertex-attribute pairs * @param mergeFunc the associative, commutative merge function. */ - def apply[VD: ClassTag](rdd: RDD[(VertexID, VD)], mergeFunc: (VD, VD) => VD): VertexRDD[VD] = { - val partitioned: RDD[(VertexID, VD)] = rdd.partitioner match { + def apply[VD: ClassTag](rdd: RDD[(VertexId, VD)], mergeFunc: (VD, VD) => VD): VertexRDD[VD] = { + val partitioned: RDD[(VertexId, VD)] = rdd.partitioner match { case Some(p) => rdd case None => rdd.partitionBy(new HashPartitioner(rdd.partitions.size)) } @@ -338,7 +338,7 @@ object VertexRDD { * Constructs a VertexRDD from the vertex IDs in `vids`, taking attributes from `rdd` and using * `defaultVal` otherwise. */ - def apply[VD: ClassTag](vids: RDD[VertexID], rdd: RDD[(VertexID, VD)], defaultVal: VD) + def apply[VD: ClassTag](vids: RDD[VertexId], rdd: RDD[(VertexId, VD)], defaultVal: VD) : VertexRDD[VD] = { VertexRDD(vids.map(vid => (vid, defaultVal))).leftJoin(rdd) { (vid, default, value) => value.getOrElse(default) diff --git a/graphx/src/main/scala/org/apache/spark/graphx/impl/EdgePartition.scala b/graphx/src/main/scala/org/apache/spark/graphx/impl/EdgePartition.scala index ee95ead3ada9b..57fa5eefd5e09 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/impl/EdgePartition.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/impl/EdgePartition.scala @@ -1,3 +1,20 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + package org.apache.spark.graphx.impl import scala.reflect.ClassTag @@ -17,10 +34,10 @@ import org.apache.spark.graphx.util.collection.PrimitiveKeyOpenHashMap */ private[graphx] class EdgePartition[@specialized(Char, Int, Boolean, Byte, Long, Float, Double) ED: ClassTag]( - val srcIds: Array[VertexID], - val dstIds: Array[VertexID], + val srcIds: Array[VertexId], + val dstIds: Array[VertexId], val data: Array[ED], - val index: PrimitiveKeyOpenHashMap[VertexID, Int]) extends Serializable { + val index: PrimitiveKeyOpenHashMap[VertexId, Int]) extends Serializable { /** * Reverse all the edges in this partition. @@ -101,8 +118,8 @@ class EdgePartition[@specialized(Char, Int, Boolean, Byte, Long, Float, Double) */ def groupEdges(merge: (ED, ED) => ED): EdgePartition[ED] = { val builder = new EdgePartitionBuilder[ED] - var currSrcId: VertexID = null.asInstanceOf[VertexID] - var currDstId: VertexID = null.asInstanceOf[VertexID] + var currSrcId: VertexId = null.asInstanceOf[VertexId] + var currDstId: VertexId = null.asInstanceOf[VertexId] var currAttr: ED = null.asInstanceOf[ED] var i = 0 while (i < size) { @@ -136,7 +153,7 @@ class EdgePartition[@specialized(Char, Int, Boolean, Byte, Long, Float, Double) */ def innerJoin[ED2: ClassTag, ED3: ClassTag] (other: EdgePartition[ED2]) - (f: (VertexID, VertexID, ED, ED2) => ED3): EdgePartition[ED3] = { + (f: (VertexId, VertexId, ED, ED2) => ED3): EdgePartition[ED3] = { val builder = new EdgePartitionBuilder[ED3] var i = 0 var j = 0 @@ -193,14 +210,14 @@ class EdgePartition[@specialized(Char, Int, Boolean, Byte, Long, Float, Double) * iterator is generated using an index scan, so it is efficient at skipping edges that don't * match srcIdPred. */ - def indexIterator(srcIdPred: VertexID => Boolean): Iterator[Edge[ED]] = + def indexIterator(srcIdPred: VertexId => Boolean): Iterator[Edge[ED]] = index.iterator.filter(kv => srcIdPred(kv._1)).flatMap(Function.tupled(clusterIterator)) /** * Get an iterator over the cluster of edges in this partition with source vertex id `srcId`. The * cluster must start at position `index`. */ - private def clusterIterator(srcId: VertexID, index: Int) = new Iterator[Edge[ED]] { + private def clusterIterator(srcId: VertexId, index: Int) = new Iterator[Edge[ED]] { private[this] val edge = new Edge[ED] private[this] var pos = index diff --git a/graphx/src/main/scala/org/apache/spark/graphx/impl/EdgePartitionBuilder.scala b/graphx/src/main/scala/org/apache/spark/graphx/impl/EdgePartitionBuilder.scala index 9d072f933503c..63ccccb056b48 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/impl/EdgePartitionBuilder.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/impl/EdgePartitionBuilder.scala @@ -1,3 +1,20 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + package org.apache.spark.graphx.impl import scala.reflect.ClassTag @@ -12,22 +29,22 @@ class EdgePartitionBuilder[@specialized(Long, Int, Double) ED: ClassTag](size: I var edges = new PrimitiveVector[Edge[ED]](size) /** Add a new edge to the partition. */ - def add(src: VertexID, dst: VertexID, d: ED) { + def add(src: VertexId, dst: VertexId, d: ED) { edges += Edge(src, dst, d) } def toEdgePartition: EdgePartition[ED] = { val edgeArray = edges.trim().array Sorting.quickSort(edgeArray)(Edge.lexicographicOrdering) - val srcIds = new Array[VertexID](edgeArray.size) - val dstIds = new Array[VertexID](edgeArray.size) + val srcIds = new Array[VertexId](edgeArray.size) + val dstIds = new Array[VertexId](edgeArray.size) val data = new Array[ED](edgeArray.size) - val index = new PrimitiveKeyOpenHashMap[VertexID, Int] + val index = new PrimitiveKeyOpenHashMap[VertexId, Int] // Copy edges into columnar structures, tracking the beginnings of source vertex id clusters and // adding them to the index if (edgeArray.length > 0) { index.update(srcIds(0), 0) - var currSrcId: VertexID = srcIds(0) + var currSrcId: VertexId = srcIds(0) var i = 0 while (i < edgeArray.size) { srcIds(i) = edgeArray(i).srcId diff --git a/graphx/src/main/scala/org/apache/spark/graphx/impl/EdgeTripletIterator.scala b/graphx/src/main/scala/org/apache/spark/graphx/impl/EdgeTripletIterator.scala index bad840f1cdf36..886c250d7cffd 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/impl/EdgeTripletIterator.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/impl/EdgeTripletIterator.scala @@ -1,3 +1,20 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + package org.apache.spark.graphx.impl import scala.reflect.ClassTag @@ -24,7 +41,7 @@ class EdgeTripletIterator[VD: ClassTag, ED: ClassTag]( // allocating too many temporary Java objects. private val triplet = new EdgeTriplet[VD, ED] - private val vmap = new PrimitiveKeyOpenHashMap[VertexID, VD](vidToIndex, vertexArray) + private val vmap = new PrimitiveKeyOpenHashMap[VertexId, VD](vidToIndex, vertexArray) override def hasNext: Boolean = pos < edgePartition.size diff --git a/graphx/src/main/scala/org/apache/spark/graphx/impl/GraphImpl.scala b/graphx/src/main/scala/org/apache/spark/graphx/impl/GraphImpl.scala index 56d1d9efeafa9..1d029bf009e8c 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/impl/GraphImpl.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/impl/GraphImpl.scala @@ -1,3 +1,20 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + package org.apache.spark.graphx.impl import scala.reflect.{classTag, ClassTag} @@ -88,7 +105,7 @@ class GraphImpl[VD: ClassTag, ED: ClassTag] protected ( new GraphImpl(vertices, newETable, routingTable, replicatedVertexView) } - override def mapVertices[VD2: ClassTag](f: (VertexID, VD) => VD2): Graph[VD2, ED] = { + override def mapVertices[VD2: ClassTag](f: (VertexId, VD) => VD2): Graph[VD2, ED] = { if (classTag[VD] equals classTag[VD2]) { // The map preserves type, so we can use incremental replication val newVerts = vertices.mapVertexPartitions(_.map(f)).cache() @@ -136,7 +153,7 @@ class GraphImpl[VD: ClassTag, ED: ClassTag] protected ( override def subgraph( epred: EdgeTriplet[VD, ED] => Boolean = x => true, - vpred: (VertexID, VD) => Boolean = (a, b) => true): Graph[VD, ED] = { + vpred: (VertexId, VD) => Boolean = (a, b) => true): Graph[VD, ED] = { // Filter the vertices, reusing the partitioner and the index from this graph val newVerts = vertices.mapVertexPartitions(_.filter(vpred)) @@ -178,7 +195,7 @@ class GraphImpl[VD: ClassTag, ED: ClassTag] protected ( ////////////////////////////////////////////////////////////////////////////////////////////////// override def mapReduceTriplets[A: ClassTag]( - mapFunc: EdgeTriplet[VD, ED] => Iterator[(VertexID, A)], + mapFunc: EdgeTriplet[VD, ED] => Iterator[(VertexId, A)], reduceFunc: (A, A) => A, activeSetOpt: Option[(VertexRDD[_], EdgeDirection)] = None) = { @@ -208,7 +225,7 @@ class GraphImpl[VD: ClassTag, ED: ClassTag] protected ( val edgeIter = activeDirectionOpt match { case Some(EdgeDirection.Both) => if (activeFraction < 0.8) { - edgePartition.indexIterator(srcVertexID => vPart.isActive(srcVertexID)) + edgePartition.indexIterator(srcVertexId => vPart.isActive(srcVertexId)) .filter(e => vPart.isActive(e.dstId)) } else { edgePartition.iterator.filter(e => vPart.isActive(e.srcId) && vPart.isActive(e.dstId)) @@ -219,7 +236,7 @@ class GraphImpl[VD: ClassTag, ED: ClassTag] protected ( edgePartition.iterator.filter(e => vPart.isActive(e.srcId) || vPart.isActive(e.dstId)) case Some(EdgeDirection.Out) => if (activeFraction < 0.8) { - edgePartition.indexIterator(srcVertexID => vPart.isActive(srcVertexID)) + edgePartition.indexIterator(srcVertexId => vPart.isActive(srcVertexId)) } else { edgePartition.iterator.filter(e => vPart.isActive(e.srcId)) } @@ -250,8 +267,8 @@ class GraphImpl[VD: ClassTag, ED: ClassTag] protected ( } // end of mapReduceTriplets override def outerJoinVertices[U: ClassTag, VD2: ClassTag] - (other: RDD[(VertexID, U)]) - (updateF: (VertexID, VD, Option[U]) => VD2): Graph[VD2, ED] = + (other: RDD[(VertexId, U)]) + (updateF: (VertexId, VD, Option[U]) => VD2): Graph[VD2, ED] = { if (classTag[VD] equals classTag[VD2]) { // updateF preserves type, so we can use incremental replication @@ -295,7 +312,7 @@ object GraphImpl { } def apply[VD: ClassTag, ED: ClassTag]( - vertices: RDD[(VertexID, VD)], + vertices: RDD[(VertexId, VD)], edges: 
RDD[Edge[ED]], defaultVertexAttr: VD): GraphImpl[VD, ED] = { @@ -304,7 +321,7 @@ object GraphImpl { // Get the set of all vids val partitioner = Partitioner.defaultPartitioner(vertices) val vPartitioned = vertices.partitionBy(partitioner) - val vidsFromEdges = collectVertexIDsFromEdges(edgeRDD, partitioner) + val vidsFromEdges = collectVertexIdsFromEdges(edgeRDD, partitioner) val vids = vPartitioned.zipPartitions(vidsFromEdges) { (vertexIter, vidsFromEdgesIter) => vertexIter.map(_._1) ++ vidsFromEdgesIter.map(_._1) } @@ -338,7 +355,7 @@ object GraphImpl { /** * Create the edge RDD, which is much more efficient for Java heap storage than the normal edges - * data structure (RDD[(VertexID, VertexID, ED)]). + * data structure (RDD[(VertexId, VertexId, ED)]). * * The edge RDD contains multiple partitions, and each partition contains only one RDD key-value * pair: the key is the partition id, and the value is an EdgePartition object containing all the @@ -361,19 +378,19 @@ object GraphImpl { defaultVertexAttr: VD): GraphImpl[VD, ED] = { edges.cache() // Get the set of all vids - val vids = collectVertexIDsFromEdges(edges, new HashPartitioner(edges.partitions.size)) + val vids = collectVertexIdsFromEdges(edges, new HashPartitioner(edges.partitions.size)) // Create the VertexRDD. val vertices = VertexRDD(vids.mapValues(x => defaultVertexAttr)) GraphImpl(vertices, edges) } /** Collects all vids mentioned in edges and partitions them by partitioner. */ - private def collectVertexIDsFromEdges( + private def collectVertexIdsFromEdges( edges: EdgeRDD[_], - partitioner: Partitioner): RDD[(VertexID, Int)] = { + partitioner: Partitioner): RDD[(VertexId, Int)] = { // TODO: Consider doing map side distinct before shuffle. - new ShuffledRDD[VertexID, Int, (VertexID, Int)]( - edges.collectVertexIDs.map(vid => (vid, 0)), partitioner) - .setSerializer(classOf[VertexIDMsgSerializer].getName) + new ShuffledRDD[VertexId, Int, (VertexId, Int)]( + edges.collectVertexIds.map(vid => (vid, 0)), partitioner) + .setSerializer(classOf[VertexIdMsgSerializer].getName) } } // end of object GraphImpl diff --git a/graphx/src/main/scala/org/apache/spark/graphx/impl/MessageToPartition.scala b/graphx/src/main/scala/org/apache/spark/graphx/impl/MessageToPartition.scala index 05508ff716eb1..e9ee09c3614c1 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/impl/MessageToPartition.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/impl/MessageToPartition.scala @@ -1,18 +1,35 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
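The GraphImpl.apply overload above now takes vertices: RDD[(VertexId, VD)] together with edges: RDD[Edge[ED]] and a default vertex attribute. The same shape is available through the public Graph(...) constructor used in the test suites further down. A small sketch, assuming a hypothetical local SparkContext and made-up vertex and edge data:

    import org.apache.spark.SparkContext
    import org.apache.spark.graphx._
    import org.apache.spark.rdd.RDD

    object MapVerticesSketch {
      def main(args: Array[String]) {
        val sc = new SparkContext("local", "mapVertices-sketch")
        val vertices: RDD[(VertexId, String)] =
          sc.parallelize(Seq((1L, "one"), (2L, "two"), (3L, "three")))
        val edges: RDD[Edge[Int]] =
          sc.parallelize(Seq(Edge(1L, 2L, 12), Edge(2L, 3L, 23)))
        val graph: Graph[String, Int] = Graph(vertices, edges, "missing")

        // mapVertices is (VertexId, VD) => VD2; the attribute type may stay the same or change.
        val upper: Graph[String, Int] = graph.mapVertices((vid, attr) => attr.toUpperCase)
        val lengths: Graph[Int, Int] = graph.mapVertices((vid, attr) => attr.length)
        println(lengths.vertices.collect().toSeq)
        sc.stop()
      }
    }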
+ */ + package org.apache.spark.graphx.impl import scala.reflect.{classTag, ClassTag} import org.apache.spark.Partitioner -import org.apache.spark.graphx.{PartitionID, VertexID} +import org.apache.spark.graphx.{PartitionID, VertexId} import org.apache.spark.rdd.{ShuffledRDD, RDD} private[graphx] class VertexBroadcastMsg[@specialized(Int, Long, Double, Boolean) T]( @transient var partition: PartitionID, - var vid: VertexID, + var vid: VertexId, var data: T) - extends Product2[PartitionID, (VertexID, T)] with Serializable { + extends Product2[PartitionID, (VertexId, T)] with Serializable { override def _1 = partition @@ -44,7 +61,7 @@ class MessageToPartition[@specialized(Int, Long, Double, Char, Boolean/*, AnyRef private[graphx] class VertexBroadcastMsgRDDFunctions[T: ClassTag](self: RDD[VertexBroadcastMsg[T]]) { def partitionBy(partitioner: Partitioner): RDD[VertexBroadcastMsg[T]] = { - val rdd = new ShuffledRDD[PartitionID, (VertexID, T), VertexBroadcastMsg[T]](self, partitioner) + val rdd = new ShuffledRDD[PartitionID, (VertexId, T), VertexBroadcastMsg[T]](self, partitioner) // Set a custom serializer if the data is of int or double type. if (classTag[T] == ClassTag.Int) { @@ -82,8 +99,8 @@ object MsgRDDFunctions { new VertexBroadcastMsgRDDFunctions(rdd) } - def partitionForAggregation[T: ClassTag](msgs: RDD[(VertexID, T)], partitioner: Partitioner) = { - val rdd = new ShuffledRDD[VertexID, T, (VertexID, T)](msgs, partitioner) + def partitionForAggregation[T: ClassTag](msgs: RDD[(VertexId, T)], partitioner: Partitioner) = { + val rdd = new ShuffledRDD[VertexId, T, (VertexId, T)](msgs, partitioner) // Set a custom serializer if the data is of int or double type. if (classTag[T] == ClassTag.Int) { diff --git a/graphx/src/main/scala/org/apache/spark/graphx/impl/ReplicatedVertexView.scala b/graphx/src/main/scala/org/apache/spark/graphx/impl/ReplicatedVertexView.scala index 4ebe0b02671d9..a8154b63ce5fb 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/impl/ReplicatedVertexView.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/impl/ReplicatedVertexView.scala @@ -1,3 +1,20 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + package org.apache.spark.graphx.impl import scala.reflect.{classTag, ClassTag} @@ -33,9 +50,9 @@ class ReplicatedVertexView[VD: ClassTag]( * vids from both the source and destination of edges. It must always include both source and * destination vids because some operations, such as GraphImpl.mapReduceTriplets, rely on this. 
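The localVertexIdMap above must contain both source and destination ids precisely so that mapReduceTriplets can look up either endpoint. As a user-facing illustration, here is a degree count written with mapReduceTriplets, mirroring the pattern used in GraphSuite later in this diff; the local SparkContext and star graph are invented for the example:

    import org.apache.spark.SparkContext
    import org.apache.spark.graphx._

    object MapReduceTripletsSketch {
      def main(args: Array[String]) {
        val sc = new SparkContext("local", "mrTriplets-sketch")
        // A star: edges 0 -> 1 .. 0 -> 5, default vertex attribute "v".
        val star = Graph.fromEdgeTuples(
          sc.parallelize((1 to 5).map(x => (0L, x.toLong))), "v")

        // Send one message per edge to each endpoint and sum them: in-degree + out-degree.
        val degrees: VertexRDD[Int] = star.mapReduceTriplets[Int](
          et => Iterator((et.srcId, 1), (et.dstId, 1)),
          (a, b) => a + b)
        degrees.collect().foreach(println)              // (0,5) and (i,1) for each leaf
        sc.stop()
      }
    }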
*/ - private val localVertexIDMap: RDD[(Int, VertexIdToIndexMap)] = prevViewOpt match { + private val localVertexIdMap: RDD[(Int, VertexIdToIndexMap)] = prevViewOpt match { case Some(prevView) => - prevView.localVertexIDMap + prevView.localVertexIdMap case None => edges.partitionsRDD.mapPartitions(_.map { case (pid, epart) => @@ -45,7 +62,7 @@ class ReplicatedVertexView[VD: ClassTag]( vidToIndex.add(e.dstId) } (pid, vidToIndex) - }, preservesPartitioning = true).cache().setName("ReplicatedVertexView localVertexIDMap") + }, preservesPartitioning = true).cache().setName("ReplicatedVertexView localVertexIdMap") } private lazy val bothAttrs: RDD[(PartitionID, VertexPartition[VD])] = create(true, true) @@ -58,7 +75,7 @@ class ReplicatedVertexView[VD: ClassTag]( srcAttrOnly.unpersist(blocking) dstAttrOnly.unpersist(blocking) noAttrs.unpersist(blocking) - // Don't unpersist localVertexIDMap because a future ReplicatedVertexView may be using it + // Don't unpersist localVertexIdMap because a future ReplicatedVertexView may be using it // without modification this } @@ -116,8 +133,8 @@ class ReplicatedVertexView[VD: ClassTag]( case None => // Within each edge partition, place the shipped vertex attributes into the correct - // locations specified in localVertexIDMap - localVertexIDMap.zipPartitions(shippedVerts) { (mapIter, shippedVertsIter) => + // locations specified in localVertexIdMap + localVertexIdMap.zipPartitions(shippedVerts) { (mapIter, shippedVertsIter) => val (pid, vidToIndex) = mapIter.next() assert(!mapIter.hasNext) // Populate the vertex array using the vidToIndex map @@ -140,15 +157,15 @@ class ReplicatedVertexView[VD: ClassTag]( private object ReplicatedVertexView { protected def buildBuffer[VD: ClassTag]( - pid2vidIter: Iterator[Array[Array[VertexID]]], + pid2vidIter: Iterator[Array[Array[VertexId]]], vertexPartIter: Iterator[VertexPartition[VD]]) = { - val pid2vid: Array[Array[VertexID]] = pid2vidIter.next() + val pid2vid: Array[Array[VertexId]] = pid2vidIter.next() val vertexPart: VertexPartition[VD] = vertexPartIter.next() Iterator.tabulate(pid2vid.size) { pid => val vidsCandidate = pid2vid(pid) val size = vidsCandidate.length - val vids = new PrimitiveVector[VertexID](pid2vid(pid).size) + val vids = new PrimitiveVector[VertexId](pid2vid(pid).size) val attrs = new PrimitiveVector[VD](pid2vid(pid).size) var i = 0 while (i < size) { @@ -164,16 +181,16 @@ private object ReplicatedVertexView { } protected def buildActiveBuffer( - pid2vidIter: Iterator[Array[Array[VertexID]]], + pid2vidIter: Iterator[Array[Array[VertexId]]], activePartIter: Iterator[VertexPartition[_]]) - : Iterator[(Int, Array[VertexID])] = { - val pid2vid: Array[Array[VertexID]] = pid2vidIter.next() + : Iterator[(Int, Array[VertexId])] = { + val pid2vid: Array[Array[VertexId]] = pid2vidIter.next() val activePart: VertexPartition[_] = activePartIter.next() Iterator.tabulate(pid2vid.size) { pid => val vidsCandidate = pid2vid(pid) val size = vidsCandidate.length - val actives = new PrimitiveVector[VertexID](vidsCandidate.size) + val actives = new PrimitiveVector[VertexId](vidsCandidate.size) var i = 0 while (i < size) { val vid = vidsCandidate(i) @@ -188,8 +205,8 @@ private object ReplicatedVertexView { } private[graphx] -class VertexAttributeBlock[VD: ClassTag](val vids: Array[VertexID], val attrs: Array[VD]) +class VertexAttributeBlock[VD: ClassTag](val vids: Array[VertexId], val attrs: Array[VD]) extends Serializable { - def iterator: Iterator[(VertexID, VD)] = + def iterator: Iterator[(VertexId, VD)] = (0 until 
vids.size).iterator.map { i => (vids(i), attrs(i)) } } diff --git a/graphx/src/main/scala/org/apache/spark/graphx/impl/RoutingTable.scala b/graphx/src/main/scala/org/apache/spark/graphx/impl/RoutingTable.scala index f342fd7437903..fe44e1ee0c391 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/impl/RoutingTable.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/impl/RoutingTable.scala @@ -1,3 +1,20 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + package org.apache.spark.graphx.impl import org.apache.spark.SparkContext._ @@ -15,12 +32,12 @@ import org.apache.spark.util.collection.PrimitiveVector private[impl] class RoutingTable(edges: EdgeRDD[_], vertices: VertexRDD[_]) { - val bothAttrs: RDD[Array[Array[VertexID]]] = createPid2Vid(true, true) - val srcAttrOnly: RDD[Array[Array[VertexID]]] = createPid2Vid(true, false) - val dstAttrOnly: RDD[Array[Array[VertexID]]] = createPid2Vid(false, true) - val noAttrs: RDD[Array[Array[VertexID]]] = createPid2Vid(false, false) + val bothAttrs: RDD[Array[Array[VertexId]]] = createPid2Vid(true, true) + val srcAttrOnly: RDD[Array[Array[VertexId]]] = createPid2Vid(true, false) + val dstAttrOnly: RDD[Array[Array[VertexId]]] = createPid2Vid(false, true) + val noAttrs: RDD[Array[Array[VertexId]]] = createPid2Vid(false, false) - def get(includeSrcAttr: Boolean, includeDstAttr: Boolean): RDD[Array[Array[VertexID]]] = + def get(includeSrcAttr: Boolean, includeDstAttr: Boolean): RDD[Array[Array[VertexId]]] = (includeSrcAttr, includeDstAttr) match { case (true, true) => bothAttrs case (true, false) => srcAttrOnly @@ -29,9 +46,9 @@ class RoutingTable(edges: EdgeRDD[_], vertices: VertexRDD[_]) { } private def createPid2Vid( - includeSrcAttr: Boolean, includeDstAttr: Boolean): RDD[Array[Array[VertexID]]] = { + includeSrcAttr: Boolean, includeDstAttr: Boolean): RDD[Array[Array[VertexId]]] = { // Determine which vertices each edge partition needs by creating a mapping from vid to pid. 
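The routing table built above records, for every vertex id, which edge partitions reference it, so that vertex attributes are shipped only where they are needed. The sketch below reproduces just that idea with plain RDD operations on a toy edge set; it is not the internal implementation, which works on EdgePartitions and a VertexSet as shown in createPid2Vid:

    import org.apache.spark.SparkContext
    import org.apache.spark.SparkContext._
    import org.apache.spark.graphx._
    import org.apache.spark.rdd.RDD

    object RoutingIdeaSketch {
      def main(args: Array[String]) {
        val sc = new SparkContext("local", "routing-sketch")
        val edges: RDD[Edge[Int]] = sc.parallelize(
          Seq(Edge(1L, 2L, 0), Edge(2L, 3L, 0), Edge(3L, 1L, 0)), 2)

        // For each edge partition, emit (vertex id, partition id) for both endpoints,
        // then group by partition to see which vertex ids each partition needs.
        val vid2pid: RDD[(VertexId, Int)] = edges.mapPartitionsWithIndex { (pid, iter) =>
          iter.flatMap(e => Iterator((e.srcId, pid), (e.dstId, pid)))
        }.distinct()
        val pid2vids = vid2pid.map(_.swap).groupByKey()
        pid2vids.collect().foreach(println)
        sc.stop()
      }
    }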
- val vid2pid: RDD[(VertexID, PartitionID)] = edges.partitionsRDD.mapPartitions { iter => + val vid2pid: RDD[(VertexId, PartitionID)] = edges.partitionsRDD.mapPartitions { iter => val (pid: PartitionID, edgePartition: EdgePartition[_]) = iter.next() val numEdges = edgePartition.size val vSet = new VertexSet @@ -54,7 +71,7 @@ class RoutingTable(edges: EdgeRDD[_], vertices: VertexRDD[_]) { val numPartitions = vertices.partitions.size vid2pid.partitionBy(vertices.partitioner.get).mapPartitions { iter => - val pid2vid = Array.fill(numPartitions)(new PrimitiveVector[VertexID]) + val pid2vid = Array.fill(numPartitions)(new PrimitiveVector[VertexId]) for ((vid, pid) <- iter) { pid2vid(pid) += vid } diff --git a/graphx/src/main/scala/org/apache/spark/graphx/impl/Serializers.scala b/graphx/src/main/scala/org/apache/spark/graphx/impl/Serializers.scala index cbd6318f33cdc..c74d487e206db 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/impl/Serializers.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/impl/Serializers.scala @@ -1,3 +1,20 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + package org.apache.spark.graphx.impl import java.io.{EOFException, InputStream, OutputStream} @@ -8,12 +25,12 @@ import org.apache.spark.graphx._ import org.apache.spark.serializer._ private[graphx] -class VertexIDMsgSerializer(conf: SparkConf) extends Serializer { +class VertexIdMsgSerializer(conf: SparkConf) extends Serializer { override def newInstance(): SerializerInstance = new ShuffleSerializerInstance { override def serializeStream(s: OutputStream) = new ShuffleSerializationStream(s) { def writeObject[T](t: T) = { - val msg = t.asInstanceOf[(VertexID, _)] + val msg = t.asInstanceOf[(VertexId, _)] writeVarLong(msg._1, optimizePositive = false) this } @@ -106,7 +123,7 @@ class IntAggMsgSerializer(conf: SparkConf) extends Serializer { override def serializeStream(s: OutputStream) = new ShuffleSerializationStream(s) { def writeObject[T](t: T) = { - val msg = t.asInstanceOf[(VertexID, Int)] + val msg = t.asInstanceOf[(VertexId, Int)] writeVarLong(msg._1, optimizePositive = false) writeUnsignedVarInt(msg._2) this @@ -130,7 +147,7 @@ class LongAggMsgSerializer(conf: SparkConf) extends Serializer { override def serializeStream(s: OutputStream) = new ShuffleSerializationStream(s) { def writeObject[T](t: T) = { - val msg = t.asInstanceOf[(VertexID, Long)] + val msg = t.asInstanceOf[(VertexId, Long)] writeVarLong(msg._1, optimizePositive = false) writeVarLong(msg._2, optimizePositive = true) this @@ -154,7 +171,7 @@ class DoubleAggMsgSerializer(conf: SparkConf) extends Serializer { override def serializeStream(s: OutputStream) = new ShuffleSerializationStream(s) { def writeObject[T](t: T) = { - val msg = t.asInstanceOf[(VertexID, Double)] + val msg = t.asInstanceOf[(VertexId, Double)] writeVarLong(msg._1, optimizePositive = false) writeDouble(msg._2) this diff --git a/graphx/src/main/scala/org/apache/spark/graphx/impl/VertexPartition.scala b/graphx/src/main/scala/org/apache/spark/graphx/impl/VertexPartition.scala index f97ff75fb2f93..7a54b413dc8ca 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/impl/VertexPartition.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/impl/VertexPartition.scala @@ -1,3 +1,20 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + package org.apache.spark.graphx.impl import scala.reflect.ClassTag @@ -9,18 +26,18 @@ import org.apache.spark.util.collection.BitSet private[graphx] object VertexPartition { - def apply[VD: ClassTag](iter: Iterator[(VertexID, VD)]): VertexPartition[VD] = { - val map = new PrimitiveKeyOpenHashMap[VertexID, VD] + def apply[VD: ClassTag](iter: Iterator[(VertexId, VD)]): VertexPartition[VD] = { + val map = new PrimitiveKeyOpenHashMap[VertexId, VD] iter.foreach { case (k, v) => map(k) = v } new VertexPartition(map.keySet, map._values, map.keySet.getBitSet) } - def apply[VD: ClassTag](iter: Iterator[(VertexID, VD)], mergeFunc: (VD, VD) => VD) + def apply[VD: ClassTag](iter: Iterator[(VertexId, VD)], mergeFunc: (VD, VD) => VD) : VertexPartition[VD] = { - val map = new PrimitiveKeyOpenHashMap[VertexID, VD] + val map = new PrimitiveKeyOpenHashMap[VertexId, VD] iter.foreach { case (k, v) => map.setMerge(k, v, mergeFunc) } @@ -43,15 +60,15 @@ class VertexPartition[@specialized(Long, Int, Double) VD: ClassTag]( def size: Int = mask.cardinality() /** Return the vertex attribute for the given vertex ID. */ - def apply(vid: VertexID): VD = values(index.getPos(vid)) + def apply(vid: VertexId): VD = values(index.getPos(vid)) - def isDefined(vid: VertexID): Boolean = { + def isDefined(vid: VertexId): Boolean = { val pos = index.getPos(vid) pos >= 0 && mask.get(pos) } /** Look up vid in activeSet, throwing an exception if it is None. */ - def isActive(vid: VertexID): Boolean = { + def isActive(vid: VertexId): Boolean = { activeSet.get.contains(vid) } @@ -71,7 +88,7 @@ class VertexPartition[@specialized(Long, Int, Double) VD: ClassTag]( * each of the entries in the original VertexRDD. The resulting * VertexPartition retains the same index. */ - def map[VD2: ClassTag](f: (VertexID, VD) => VD2): VertexPartition[VD2] = { + def map[VD2: ClassTag](f: (VertexId, VD) => VD2): VertexPartition[VD2] = { // Construct a view of the map transformation val newValues = new Array[VD2](capacity) var i = mask.nextSetBit(0) @@ -91,7 +108,7 @@ class VertexPartition[@specialized(Long, Int, Double) VD: ClassTag]( * RDD can be easily joined with the original vertex-set. Furthermore, the filter only * modifies the bitmap index and so no new values are allocated. */ - def filter(pred: (VertexID, VD) => Boolean): VertexPartition[VD] = { + def filter(pred: (VertexId, VD) => Boolean): VertexPartition[VD] = { // Allocate the array to store the results into val newMask = new BitSet(capacity) // Iterate over the active bits in the old mask and evaluate the predicate @@ -129,7 +146,7 @@ class VertexPartition[@specialized(Long, Int, Double) VD: ClassTag]( /** Left outer join another VertexPartition. */ def leftJoin[VD2: ClassTag, VD3: ClassTag] (other: VertexPartition[VD2]) - (f: (VertexID, VD, Option[VD2]) => VD3): VertexPartition[VD3] = { + (f: (VertexId, VD, Option[VD2]) => VD3): VertexPartition[VD3] = { if (index != other.index) { logWarning("Joining two VertexPartitions with different indexes is slow.") leftJoin(createUsingIndex(other.iterator))(f) @@ -148,14 +165,14 @@ class VertexPartition[@specialized(Long, Int, Double) VD: ClassTag]( /** Left outer join another iterator of messages. */ def leftJoin[VD2: ClassTag, VD3: ClassTag] - (other: Iterator[(VertexID, VD2)]) - (f: (VertexID, VD, Option[VD2]) => VD3): VertexPartition[VD3] = { + (other: Iterator[(VertexId, VD2)]) + (f: (VertexId, VD, Option[VD2]) => VD3): VertexPartition[VD3] = { leftJoin(createUsingIndex(other))(f) } /** Inner join another VertexPartition. 
*/ def innerJoin[U: ClassTag, VD2: ClassTag](other: VertexPartition[U]) - (f: (VertexID, VD, U) => VD2): VertexPartition[VD2] = { + (f: (VertexId, VD, U) => VD2): VertexPartition[VD2] = { if (index != other.index) { logWarning("Joining two VertexPartitions with different indexes is slow.") innerJoin(createUsingIndex(other.iterator))(f) @@ -175,15 +192,15 @@ class VertexPartition[@specialized(Long, Int, Double) VD: ClassTag]( * Inner join an iterator of messages. */ def innerJoin[U: ClassTag, VD2: ClassTag] - (iter: Iterator[Product2[VertexID, U]]) - (f: (VertexID, VD, U) => VD2): VertexPartition[VD2] = { + (iter: Iterator[Product2[VertexId, U]]) + (f: (VertexId, VD, U) => VD2): VertexPartition[VD2] = { innerJoin(createUsingIndex(iter))(f) } /** * Similar effect as aggregateUsingIndex((a, b) => a) */ - def createUsingIndex[VD2: ClassTag](iter: Iterator[Product2[VertexID, VD2]]) + def createUsingIndex[VD2: ClassTag](iter: Iterator[Product2[VertexId, VD2]]) : VertexPartition[VD2] = { val newMask = new BitSet(capacity) val newValues = new Array[VD2](capacity) @@ -201,7 +218,7 @@ class VertexPartition[@specialized(Long, Int, Double) VD: ClassTag]( * Similar to innerJoin, but vertices from the left side that don't appear in iter will remain in * the partition, hidden by the bitmask. */ - def innerJoinKeepLeft(iter: Iterator[Product2[VertexID, VD]]): VertexPartition[VD] = { + def innerJoinKeepLeft(iter: Iterator[Product2[VertexId, VD]]): VertexPartition[VD] = { val newMask = new BitSet(capacity) val newValues = new Array[VD](capacity) System.arraycopy(values, 0, newValues, 0, newValues.length) @@ -216,7 +233,7 @@ class VertexPartition[@specialized(Long, Int, Double) VD: ClassTag]( } def aggregateUsingIndex[VD2: ClassTag]( - iter: Iterator[Product2[VertexID, VD2]], + iter: Iterator[Product2[VertexId, VD2]], reduceFunc: (VD2, VD2) => VD2): VertexPartition[VD2] = { val newMask = new BitSet(capacity) val newValues = new Array[VD2](capacity) @@ -236,7 +253,7 @@ class VertexPartition[@specialized(Long, Int, Double) VD: ClassTag]( new VertexPartition[VD2](index, newValues, newMask) } - def replaceActives(iter: Iterator[VertexID]): VertexPartition[VD] = { + def replaceActives(iter: Iterator[VertexId]): VertexPartition[VD] = { val newActiveSet = new VertexSet iter.foreach(newActiveSet.add(_)) new VertexPartition(index, values, mask, Some(newActiveSet)) @@ -246,7 +263,7 @@ class VertexPartition[@specialized(Long, Int, Double) VD: ClassTag]( * Construct a new VertexPartition whose index contains only the vertices in the mask. 
*/ def reindex(): VertexPartition[VD] = { - val hashMap = new PrimitiveKeyOpenHashMap[VertexID, VD] + val hashMap = new PrimitiveKeyOpenHashMap[VertexId, VD] val arbitraryMerge = (a: VD, b: VD) => a for ((k, v) <- this.iterator) { hashMap.setMerge(k, v, arbitraryMerge) @@ -254,8 +271,8 @@ class VertexPartition[@specialized(Long, Int, Double) VD: ClassTag]( new VertexPartition(hashMap.keySet, hashMap._values, hashMap.keySet.getBitSet) } - def iterator: Iterator[(VertexID, VD)] = + def iterator: Iterator[(VertexId, VD)] = mask.iterator.map(ind => (index.getValue(ind), values(ind))) - def vidIterator: Iterator[VertexID] = mask.iterator.map(ind => index.getValue(ind)) + def vidIterator: Iterator[VertexId] = mask.iterator.map(ind => index.getValue(ind)) } diff --git a/graphx/src/main/scala/org/apache/spark/graphx/impl/package.scala b/graphx/src/main/scala/org/apache/spark/graphx/impl/package.scala index cfc3281b6407e..79549fe060457 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/impl/package.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/impl/package.scala @@ -1,7 +1,24 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + package org.apache.spark.graphx import org.apache.spark.util.collection.OpenHashSet package object impl { - private[graphx] type VertexIdToIndexMap = OpenHashSet[VertexID] + private[graphx] type VertexIdToIndexMap = OpenHashSet[VertexId] } diff --git a/graphx/src/main/scala/org/apache/spark/graphx/lib/Analytics.scala b/graphx/src/main/scala/org/apache/spark/graphx/lib/Analytics.scala index e0aff5644e40d..f914e0565ca21 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/lib/Analytics.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/lib/Analytics.scala @@ -1,3 +1,20 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + package org.apache.spark.graphx.lib import org.apache.spark._ diff --git a/graphx/src/main/scala/org/apache/spark/graphx/lib/ConnectedComponents.scala b/graphx/src/main/scala/org/apache/spark/graphx/lib/ConnectedComponents.scala index 4d1f5e74df59f..e2f6cc138958e 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/lib/ConnectedComponents.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/lib/ConnectedComponents.scala @@ -1,3 +1,20 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + package org.apache.spark.graphx.lib import scala.reflect.ClassTag @@ -18,9 +35,9 @@ object ConnectedComponents { * @return a graph with vertex attributes containing the smallest vertex in each * connected component */ - def run[VD: ClassTag, ED: ClassTag](graph: Graph[VD, ED]): Graph[VertexID, ED] = { + def run[VD: ClassTag, ED: ClassTag](graph: Graph[VD, ED]): Graph[VertexId, ED] = { val ccGraph = graph.mapVertices { case (vid, _) => vid } - def sendMessage(edge: EdgeTriplet[VertexID, ED]) = { + def sendMessage(edge: EdgeTriplet[VertexId, ED]) = { if (edge.srcAttr < edge.dstAttr) { Iterator((edge.dstId, edge.srcAttr)) } else if (edge.srcAttr > edge.dstAttr) { diff --git a/graphx/src/main/scala/org/apache/spark/graphx/lib/PageRank.scala b/graphx/src/main/scala/org/apache/spark/graphx/lib/PageRank.scala index 2f4d6d686499a..614555a054dfb 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/lib/PageRank.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/lib/PageRank.scala @@ -1,3 +1,20 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
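ConnectedComponents.run above now returns a Graph[VertexId, ED] in which every vertex carries the smallest vertex id of its connected component. A short sketch on an invented two-component graph, assuming a local SparkContext:

    import org.apache.spark.SparkContext
    import org.apache.spark.graphx._
    import org.apache.spark.graphx.lib.ConnectedComponents

    object ConnectedComponentsSketch {
      def main(args: Array[String]) {
        val sc = new SparkContext("local", "cc-sketch")
        // Two components: {1, 2, 3} and {10, 11}.
        val graph = Graph.fromEdgeTuples(
          sc.parallelize(Seq((1L, 2L), (2L, 3L), (10L, 11L))), 0)

        val cc: Graph[VertexId, Int] = ConnectedComponents.run(graph)
        cc.vertices.collect().foreach(println)          // (1,1) (2,1) (3,1) (10,10) (11,10)
        sc.stop()
      }
    }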
+ */ + package org.apache.spark.graphx.lib import scala.reflect.ClassTag @@ -75,7 +92,7 @@ object PageRank extends Logging { // Define the three functions needed to implement PageRank in the GraphX // version of Pregel - def vertexProgram(id: VertexID, attr: Double, msgSum: Double): Double = + def vertexProgram(id: VertexId, attr: Double, msgSum: Double): Double = resetProb + (1.0 - resetProb) * msgSum def sendMessage(edge: EdgeTriplet[Double, Double]) = Iterator((edge.dstId, edge.srcAttr * edge.attr)) @@ -120,7 +137,7 @@ object PageRank extends Logging { // Define the three functions needed to implement PageRank in the GraphX // version of Pregel - def vertexProgram(id: VertexID, attr: (Double, Double), msgSum: Double): (Double, Double) = { + def vertexProgram(id: VertexId, attr: (Double, Double), msgSum: Double): (Double, Double) = { val (oldPR, lastDelta) = attr val newPR = oldPR + (1.0 - resetProb) * msgSum (newPR, newPR - oldPR) diff --git a/graphx/src/main/scala/org/apache/spark/graphx/lib/SVDPlusPlus.scala b/graphx/src/main/scala/org/apache/spark/graphx/lib/SVDPlusPlus.scala index ba6517e012d28..79280f836f21d 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/lib/SVDPlusPlus.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/lib/SVDPlusPlus.scala @@ -1,7 +1,24 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
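The vertexProgram above applies rank = resetProb + (1 - resetProb) * msgSum on every superstep. A usage sketch follows; the run(graph, numIter) entry point is not part of this hunk and is assumed here to have its usual shape, so treat the call as illustrative rather than definitive:

    import org.apache.spark.SparkContext
    import org.apache.spark.graphx._
    import org.apache.spark.graphx.lib.PageRank

    object PageRankSketch {
      def main(args: Array[String]) {
        val sc = new SparkContext("local", "pagerank-sketch")
        // A tiny link graph: 1 -> 2, 2 -> 3, 3 -> 1, 3 -> 2.
        val graph = Graph.fromEdgeTuples(
          sc.parallelize(Seq((1L, 2L), (2L, 3L), (3L, 1L), (3L, 2L))), 1)

        // 20 fixed iterations of the static PageRank formulation shown above.
        val ranks: Graph[Double, Double] = PageRank.run(graph, 20)
        ranks.vertices.collect().foreach(println)
        sc.stop()
      }
    }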
+ */ + package org.apache.spark.graphx.lib import scala.util.Random -import org.apache.commons.math.linear._ +import org.apache.commons.math3.linear._ import org.apache.spark.rdd._ import org.apache.spark.graphx._ @@ -62,13 +79,13 @@ object SVDPlusPlus { (g1: (Long, Double), g2: (Long, Double)) => (g1._1 + g2._1, g1._2 + g2._2)) g = g.outerJoinVertices(t0) { - (vid: VertexID, vd: (RealVector, RealVector, Double, Double), msg: Option[(Long, Double)]) => + (vid: VertexId, vd: (RealVector, RealVector, Double, Double), msg: Option[(Long, Double)]) => (vd._1, vd._2, msg.get._2 / msg.get._1, 1.0 / scala.math.sqrt(msg.get._1)) } def mapTrainF(conf: Conf, u: Double) (et: EdgeTriplet[(RealVector, RealVector, Double, Double), Double]) - : Iterator[(VertexID, (RealVector, RealVector, Double))] = { + : Iterator[(VertexId, (RealVector, RealVector, Double))] = { val (usr, itm) = (et.srcAttr, et.dstAttr) val (p, q) = (usr._1, itm._1) var pred = u + usr._3 + itm._3 + q.dotProduct(usr._2) @@ -95,7 +112,7 @@ object SVDPlusPlus { et => Iterator((et.srcId, et.dstAttr._2)), (g1: RealVector, g2: RealVector) => g1.add(g2)) g = g.outerJoinVertices(t1) { - (vid: VertexID, vd: (RealVector, RealVector, Double, Double), msg: Option[RealVector]) => + (vid: VertexId, vd: (RealVector, RealVector, Double, Double), msg: Option[RealVector]) => if (msg.isDefined) (vd._1, vd._1.add(msg.get.mapMultiply(vd._4)), vd._3, vd._4) else vd } @@ -106,7 +123,7 @@ object SVDPlusPlus { (g1: (RealVector, RealVector, Double), g2: (RealVector, RealVector, Double)) => (g1._1.add(g2._1), g1._2.add(g2._2), g1._3 + g2._3)) g = g.outerJoinVertices(t2) { - (vid: VertexID, + (vid: VertexId, vd: (RealVector, RealVector, Double, Double), msg: Option[(RealVector, RealVector, Double)]) => (vd._1.add(msg.get._1), vd._2.add(msg.get._2), vd._3 + msg.get._3, vd._4) @@ -116,7 +133,7 @@ object SVDPlusPlus { // calculate error on training set def mapTestF(conf: Conf, u: Double) (et: EdgeTriplet[(RealVector, RealVector, Double, Double), Double]) - : Iterator[(VertexID, Double)] = + : Iterator[(VertexId, Double)] = { val (usr, itm) = (et.srcAttr, et.dstAttr) val (p, q) = (usr._1, itm._1) @@ -129,7 +146,7 @@ object SVDPlusPlus { g.cache() val t3 = g.mapReduceTriplets(mapTestF(conf, u), (g1: Double, g2: Double) => g1 + g2) g = g.outerJoinVertices(t3) { - (vid: VertexID, vd: (RealVector, RealVector, Double, Double), msg: Option[Double]) => + (vid: VertexId, vd: (RealVector, RealVector, Double, Double), msg: Option[Double]) => if (msg.isDefined) (vd._1, vd._2, vd._3, msg.get) else vd } diff --git a/graphx/src/main/scala/org/apache/spark/graphx/lib/StronglyConnectedComponents.scala b/graphx/src/main/scala/org/apache/spark/graphx/lib/StronglyConnectedComponents.scala index d3d496e335481..46da38eeb725a 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/lib/StronglyConnectedComponents.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/lib/StronglyConnectedComponents.scala @@ -1,3 +1,20 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + package org.apache.spark.graphx.lib import scala.reflect.ClassTag @@ -18,7 +35,7 @@ object StronglyConnectedComponents { * * @return a graph with vertex attributes containing the smallest vertex id in each SCC */ - def run[VD: ClassTag, ED: ClassTag](graph: Graph[VD, ED], numIter: Int): Graph[VertexID, ED] = { + def run[VD: ClassTag, ED: ClassTag](graph: Graph[VD, ED], numIter: Int): Graph[VertexId, ED] = { // the graph we update with final SCC ids, and the graph we return at the end var sccGraph = graph.mapVertices { case (vid, _) => vid } @@ -54,7 +71,7 @@ object StronglyConnectedComponents { // collect min of all my neighbor's scc values, update if it's smaller than mine // then notify any neighbors with scc values larger than mine - sccWorkGraph = Pregel[(VertexID, Boolean), ED, VertexID]( + sccWorkGraph = Pregel[(VertexId, Boolean), ED, VertexId]( sccWorkGraph, Long.MaxValue, activeDirection = EdgeDirection.Out)( (vid, myScc, neighborScc) => (math.min(myScc._1, neighborScc), myScc._2), e => { @@ -68,7 +85,7 @@ object StronglyConnectedComponents { // start at root of SCCs. Traverse values in reverse, notify all my neighbors // do not propagate if colors do not match! - sccWorkGraph = Pregel[(VertexID, Boolean), ED, Boolean]( + sccWorkGraph = Pregel[(VertexId, Boolean), ED, Boolean]( sccWorkGraph, false, activeDirection = EdgeDirection.In)( // vertex is final if it is the root of a color // or it has the same color as a neighbor that is final diff --git a/graphx/src/main/scala/org/apache/spark/graphx/lib/TriangleCount.scala b/graphx/src/main/scala/org/apache/spark/graphx/lib/TriangleCount.scala index 23c9c40594e8b..7c396e6e66a28 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/lib/TriangleCount.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/lib/TriangleCount.scala @@ -1,3 +1,20 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
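StronglyConnectedComponents.run above labels every vertex with the smallest VertexId in its strongly connected component, with numIter bounding the outer loop. A sketch on an invented graph containing one 3-cycle and one dangling vertex, assuming a local SparkContext:

    import org.apache.spark.SparkContext
    import org.apache.spark.graphx._
    import org.apache.spark.graphx.lib.StronglyConnectedComponents

    object SccSketch {
      def main(args: Array[String]) {
        val sc = new SparkContext("local", "scc-sketch")
        // Cycle 1 -> 2 -> 3 -> 1, plus 3 -> 4.
        val graph = Graph.fromEdgeTuples(
          sc.parallelize(Seq((1L, 2L), (2L, 3L), (3L, 1L), (3L, 4L))), 0)

        val scc: Graph[VertexId, Int] = StronglyConnectedComponents.run(graph, numIter = 10)
        scc.vertices.collect().foreach(println)         // (1,1) (2,1) (3,1) (4,4)
        sc.stop()
      }
    }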
+ */ + package org.apache.spark.graphx.lib import scala.reflect.ClassTag @@ -44,7 +61,7 @@ object TriangleCount { (vid, _, optSet) => optSet.getOrElse(null) } // Edge function computes intersection of smaller vertex with larger vertex - def edgeFunc(et: EdgeTriplet[VertexSet, ED]): Iterator[(VertexID, Int)] = { + def edgeFunc(et: EdgeTriplet[VertexSet, ED]): Iterator[(VertexId, Int)] = { assert(et.srcAttr != null) assert(et.dstAttr != null) val (smallSet, largeSet) = if (et.srcAttr.size < et.dstAttr.size) { diff --git a/graphx/src/main/scala/org/apache/spark/graphx/package.scala b/graphx/src/main/scala/org/apache/spark/graphx/package.scala index 60dfc1dc37a53..425a5164cad24 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/package.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/package.scala @@ -1,3 +1,20 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + package org.apache.spark import org.apache.spark.util.collection.OpenHashSet @@ -8,11 +25,11 @@ package object graphx { * A 64-bit vertex identifier that uniquely identifies a vertex within a graph. It does not need * to follow any ordering or any constraints other than uniqueness. */ - type VertexID = Long + type VertexId = Long /** Integer identifer of a graph partition. */ // TODO: Consider using Char. type PartitionID = Int - private[graphx] type VertexSet = OpenHashSet[VertexID] + private[graphx] type VertexSet = OpenHashSet[VertexId] } diff --git a/graphx/src/main/scala/org/apache/spark/graphx/util/BytecodeUtils.scala b/graphx/src/main/scala/org/apache/spark/graphx/util/BytecodeUtils.scala index 1c5b234d74791..d1528e2f07cf2 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/util/BytecodeUtils.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/util/BytecodeUtils.scala @@ -1,3 +1,20 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
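The package object change above renames the alias itself: type VertexId = Long. Since it is only an alias, plain Long values keep working anywhere a vertex id is expected, as in this tiny sketch with an invented edge:

    import org.apache.spark.graphx._   // brings the VertexId alias into scope

    object VertexIdSketch {
      def main(args: Array[String]) {
        val vid: VertexId = 42L
        val e = Edge(1L, vid, "follows")
        println(e.srcId + " -> " + e.dstId + " : " + e.attr)
      }
    }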
+ */ + package org.apache.spark.graphx.util import java.io.{ByteArrayInputStream, ByteArrayOutputStream} diff --git a/graphx/src/main/scala/org/apache/spark/graphx/util/GraphGenerators.scala b/graphx/src/main/scala/org/apache/spark/graphx/util/GraphGenerators.scala index 57422ce3f1934..7677641bfede6 100644 --- a/graphx/src/main/scala/org/apache/spark/graphx/util/GraphGenerators.scala +++ b/graphx/src/main/scala/org/apache/spark/graphx/util/GraphGenerators.scala @@ -1,3 +1,20 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + package org.apache.spark.graphx.util import scala.annotation.tailrec @@ -33,7 +50,7 @@ object GraphGenerators { val mu = 4 val sigma = 1.3 - val vertices: RDD[(VertexID, Int)] = sc.parallelize(0 until numVertices).map{ + val vertices: RDD[(VertexId, Int)] = sc.parallelize(0 until numVertices).map{ src => (src, sampleLogNormal(mu, sigma, numVertices)) } val edges = vertices.flatMap { v => @@ -42,9 +59,9 @@ object GraphGenerators { Graph(vertices, edges, 0) } - def generateRandomEdges(src: Int, numEdges: Int, maxVertexID: Int): Array[Edge[Int]] = { + def generateRandomEdges(src: Int, numEdges: Int, maxVertexId: Int): Array[Edge[Int]] = { val rand = new Random() - Array.fill(maxVertexID) { Edge[Int](src, rand.nextInt(maxVertexID), 1) } + Array.fill(maxVertexId) { Edge[Int](src, rand.nextInt(maxVertexId), 1) } } /** @@ -189,9 +206,9 @@ object GraphGenerators { */ def gridGraph(sc: SparkContext, rows: Int, cols: Int): Graph[(Int,Int), Double] = { // Convert row column address into vertex ids (row major order) - def sub2ind(r: Int, c: Int): VertexID = r * cols + c + def sub2ind(r: Int, c: Int): VertexId = r * cols + c - val vertices: RDD[(VertexID, (Int,Int))] = + val vertices: RDD[(VertexId, (Int,Int))] = sc.parallelize(0 until rows).flatMap( r => (0 until cols).map( c => (sub2ind(r,c), (r,c)) ) ) val edges: RDD[Edge[Double]] = vertices.flatMap{ case (vid, (r,c)) => @@ -211,7 +228,7 @@ object GraphGenerators { * being the center vertex. 
*/ def starGraph(sc: SparkContext, nverts: Int): Graph[Int, Int] = { - val edges: RDD[(VertexID, VertexID)] = sc.parallelize(1 until nverts).map(vid => (vid, 0)) + val edges: RDD[(VertexId, VertexId)] = sc.parallelize(1 until nverts).map(vid => (vid, 0)) Graph.fromEdgeTuples(edges, 1) } // end of starGraph diff --git a/graphx/src/test/resources/als-test.data b/graphx/src/test/resources/als-test.data new file mode 100644 index 0000000000000..e476cc23e047d --- /dev/null +++ b/graphx/src/test/resources/als-test.data @@ -0,0 +1,16 @@ +1,1,5.0 +1,2,1.0 +1,3,5.0 +1,4,1.0 +2,1,5.0 +2,2,1.0 +2,3,5.0 +2,4,1.0 +3,1,1.0 +3,2,5.0 +3,3,1.0 +3,4,5.0 +4,1,1.0 +4,2,5.0 +4,3,1.0 +4,4,5.0 diff --git a/graphx/src/test/scala/org/apache/spark/graphx/GraphOpsSuite.scala b/graphx/src/test/scala/org/apache/spark/graphx/GraphOpsSuite.scala index 280f50e39aa5f..bc2ad5677f806 100644 --- a/graphx/src/test/scala/org/apache/spark/graphx/GraphOpsSuite.scala +++ b/graphx/src/test/scala/org/apache/spark/graphx/GraphOpsSuite.scala @@ -1,3 +1,20 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
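GraphGenerators.starGraph and gridGraph above are convenient for building test graphs. A usage sketch with a hypothetical local SparkContext; the commented counts assume the usual star and grid layouts described in this file:

    import org.apache.spark.SparkContext
    import org.apache.spark.graphx._
    import org.apache.spark.graphx.util.GraphGenerators

    object GeneratorsSketch {
      def main(args: Array[String]) {
        val sc = new SparkContext("local", "generators-sketch")
        // Star on 100 vertices: every vertex 1..99 points at the centre vertex 0.
        val star: Graph[Int, Int] = GraphGenerators.starGraph(sc, 100)
        star.inDegrees.collect().foreach(println)       // only (0,99) has a non-zero in-degree

        // 10 x 10 grid with (row, col) vertex attributes; edges link right and down neighbours.
        val grid: Graph[(Int, Int), Double] = GraphGenerators.gridGraph(sc, 10, 10)
        println(grid.edges.count())                     // 9 * 10 + 10 * 9 = 180 directed edges
        sc.stop()
      }
    }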
+ */ + package org.apache.spark.graphx import org.apache.spark.SparkContext @@ -11,12 +28,12 @@ class GraphOpsSuite extends FunSuite with LocalSparkContext { test("joinVertices") { withSpark { sc => val vertices = - sc.parallelize(Seq[(VertexID, String)]((1, "one"), (2, "two"), (3, "three")), 2) + sc.parallelize(Seq[(VertexId, String)]((1, "one"), (2, "two"), (3, "three")), 2) val edges = sc.parallelize((Seq(Edge(1, 2, "onetwo")))) val g: Graph[String, String] = Graph(vertices, edges) - val tbl = sc.parallelize(Seq[(VertexID, Int)]((1, 10), (2, 20))) - val g1 = g.joinVertices(tbl) { (vid: VertexID, attr: String, u: Int) => attr + u } + val tbl = sc.parallelize(Seq[(VertexId, Int)]((1, 10), (2, 20))) + val g1 = g.joinVertices(tbl) { (vid: VertexId, attr: String, u: Int) => attr + u } val v = g1.vertices.collect().toSet assert(v === Set((1, "one10"), (2, "two20"), (3, "three"))) @@ -43,7 +60,7 @@ class GraphOpsSuite extends FunSuite with LocalSparkContext { test ("filter") { withSpark { sc => val n = 5 - val vertices = sc.parallelize((0 to n).map(x => (x:VertexID, x))) + val vertices = sc.parallelize((0 to n).map(x => (x:VertexId, x))) val edges = sc.parallelize((1 to n).map(x => Edge(0, x, x))) val graph: Graph[Int, Int] = Graph(vertices, edges).cache() val filteredGraph = graph.filter( @@ -51,7 +68,7 @@ class GraphOpsSuite extends FunSuite with LocalSparkContext { val degrees: VertexRDD[Int] = graph.outDegrees graph.outerJoinVertices(degrees) {(vid, data, deg) => deg.getOrElse(0)} }, - vpred = (vid: VertexID, deg:Int) => deg > 0 + vpred = (vid: VertexId, deg:Int) => deg > 0 ).cache() val v = filteredGraph.vertices.collect().toSet diff --git a/graphx/src/test/scala/org/apache/spark/graphx/GraphSuite.scala b/graphx/src/test/scala/org/apache/spark/graphx/GraphSuite.scala index 9587f04c3e716..28d34dd9a1a41 100644 --- a/graphx/src/test/scala/org/apache/spark/graphx/GraphSuite.scala +++ b/graphx/src/test/scala/org/apache/spark/graphx/GraphSuite.scala @@ -1,3 +1,20 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
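joinVertices and outerJoinVertices, both exercised in the suites here, differ in how they treat missing entries: joinVertices keeps the original attribute and its type, while outerJoinVertices receives an Option and may change the attribute type. A combined sketch, assuming a local SparkContext and the same toy data as GraphOpsSuite:

    import org.apache.spark.SparkContext
    import org.apache.spark.graphx._
    import org.apache.spark.rdd.RDD

    object JoinVerticesSketch {
      def main(args: Array[String]) {
        val sc = new SparkContext("local", "join-sketch")
        val graph: Graph[String, String] = Graph(
          sc.parallelize(Seq[(VertexId, String)]((1L, "one"), (2L, "two"), (3L, "three"))),
          sc.parallelize(Seq(Edge(1L, 2L, "onetwo"))))
        val tbl: RDD[(VertexId, Int)] = sc.parallelize(Seq((1L, 10), (2L, 20)))

        // Vertex 3 is absent from tbl: joinVertices leaves it untouched.
        val joined = graph.joinVertices(tbl) { (vid, attr, u) => attr + u }
        // outerJoinVertices sees None for vertex 3 and here switches the type to Boolean.
        val flagged = graph.outerJoinVertices(tbl) { (vid, attr, uOpt) => uOpt.isDefined }
        println(joined.vertices.collect().toSet)        // (1,one10) (2,two20) (3,three)
        println(flagged.vertices.collect().toSet)       // (1,true) (2,true) (3,false)
        sc.stop()
      }
    }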
+ */ + package org.apache.spark.graphx import org.scalatest.FunSuite @@ -10,7 +27,7 @@ import org.apache.spark.rdd._ class GraphSuite extends FunSuite with LocalSparkContext { def starGraph(sc: SparkContext, n: Int): Graph[String, Int] = { - Graph.fromEdgeTuples(sc.parallelize((1 to n).map(x => (0: VertexID, x: VertexID)), 3), "v") + Graph.fromEdgeTuples(sc.parallelize((1 to n).map(x => (0: VertexId, x: VertexId)), 3), "v") } test("Graph.fromEdgeTuples") { @@ -40,7 +57,7 @@ class GraphSuite extends FunSuite with LocalSparkContext { withSpark { sc => val rawEdges = (0L to 98L).zip((1L to 99L) :+ 0L) val edges: RDD[Edge[Int]] = sc.parallelize(rawEdges).map { case (s, t) => Edge(s, t, 1) } - val vertices: RDD[(VertexID, Boolean)] = sc.parallelize((0L until 10L).map(id => (id, true))) + val vertices: RDD[(VertexId, Boolean)] = sc.parallelize((0L until 10L).map(id => (id, true))) val graph = Graph(vertices, edges, false) assert( graph.edges.count() === rawEdges.size ) // Vertices not explicitly provided but referenced by edges should be created automatically @@ -57,7 +74,7 @@ class GraphSuite extends FunSuite with LocalSparkContext { val n = 5 val star = starGraph(sc, n) assert(star.triplets.map(et => (et.srcId, et.dstId, et.srcAttr, et.dstAttr)).collect.toSet === - (1 to n).map(x => (0: VertexID, x: VertexID, "v", "v")).toSet) + (1 to n).map(x => (0: VertexId, x: VertexId, "v", "v")).toSet) } } @@ -93,7 +110,7 @@ class GraphSuite extends FunSuite with LocalSparkContext { val p = 100 val verts = 1 to n val graph = Graph.fromEdgeTuples(sc.parallelize(verts.flatMap(x => - verts.filter(y => y % x == 0).map(y => (x: VertexID, y: VertexID))), p), 0) + verts.filter(y => y % x == 0).map(y => (x: VertexId, y: VertexId))), p), 0) assert(graph.edges.partitions.length === p) val partitionedGraph = graph.partitionBy(EdgePartition2D) assert(graph.edges.partitions.length === p) @@ -119,10 +136,10 @@ class GraphSuite extends FunSuite with LocalSparkContext { val star = starGraph(sc, n) // mapVertices preserving type val mappedVAttrs = star.mapVertices((vid, attr) => attr + "2") - assert(mappedVAttrs.vertices.collect.toSet === (0 to n).map(x => (x: VertexID, "v2")).toSet) + assert(mappedVAttrs.vertices.collect.toSet === (0 to n).map(x => (x: VertexId, "v2")).toSet) // mapVertices changing type val mappedVAttrs2 = star.mapVertices((vid, attr) => attr.length) - assert(mappedVAttrs2.vertices.collect.toSet === (0 to n).map(x => (x: VertexID, 1)).toSet) + assert(mappedVAttrs2.vertices.collect.toSet === (0 to n).map(x => (x: VertexId, 1)).toSet) } } @@ -151,7 +168,7 @@ class GraphSuite extends FunSuite with LocalSparkContext { withSpark { sc => val n = 5 val star = starGraph(sc, n) - assert(star.reverse.outDegrees.collect.toSet === (1 to n).map(x => (x: VertexID, 1)).toSet) + assert(star.reverse.outDegrees.collect.toSet === (1 to n).map(x => (x: VertexId, 1)).toSet) } } @@ -174,7 +191,7 @@ class GraphSuite extends FunSuite with LocalSparkContext { test("mask") { withSpark { sc => val n = 5 - val vertices = sc.parallelize((0 to n).map(x => (x:VertexID, x))) + val vertices = sc.parallelize((0 to n).map(x => (x:VertexId, x))) val edges = sc.parallelize((1 to n).map(x => Edge(0, x, x))) val graph: Graph[Int, Int] = Graph(vertices, edges).cache() @@ -201,7 +218,7 @@ class GraphSuite extends FunSuite with LocalSparkContext { val star = starGraph(sc, n) val doubleStar = Graph.fromEdgeTuples( sc.parallelize((1 to n).flatMap(x => - List((0: VertexID, x: VertexID), (0: VertexID, x: VertexID))), 1), "v") + List((0: VertexId, 
x: VertexId), (0: VertexId, x: VertexId))), 1), "v") val star2 = doubleStar.groupEdges { (a, b) => a} assert(star2.edges.collect.toArray.sorted(Edge.lexicographicOrdering[Int]) === star.edges.collect.toArray.sorted(Edge.lexicographicOrdering[Int])) @@ -220,7 +237,7 @@ class GraphSuite extends FunSuite with LocalSparkContext { assert(neighborDegreeSums.collect().toSet === (0 to n).map(x => (x, n)).toSet) // activeSetOpt - val allPairs = for (x <- 1 to n; y <- 1 to n) yield (x: VertexID, y: VertexID) + val allPairs = for (x <- 1 to n; y <- 1 to n) yield (x: VertexId, y: VertexId) val complete = Graph.fromEdgeTuples(sc.parallelize(allPairs, 3), 0) val vids = complete.mapVertices((vid, attr) => vid).cache() val active = vids.vertices.filter { case (vid, attr) => attr % 2 == 0 } @@ -231,10 +248,10 @@ class GraphSuite extends FunSuite with LocalSparkContext { } Iterator((et.srcId, 1)) }, (a: Int, b: Int) => a + b, Some((active, EdgeDirection.In))).collect.toSet - assert(numEvenNeighbors === (1 to n).map(x => (x: VertexID, n / 2)).toSet) + assert(numEvenNeighbors === (1 to n).map(x => (x: VertexId, n / 2)).toSet) // outerJoinVertices followed by mapReduceTriplets(activeSetOpt) - val ringEdges = sc.parallelize((0 until n).map(x => (x: VertexID, (x+1) % n: VertexID)), 3) + val ringEdges = sc.parallelize((0 until n).map(x => (x: VertexId, (x+1) % n: VertexId)), 3) val ring = Graph.fromEdgeTuples(ringEdges, 0) .mapVertices((vid, attr) => vid).cache() val changed = ring.vertices.filter { case (vid, attr) => attr % 2 == 1 }.mapValues(-_).cache() val changedGraph = ring.outerJoinVertices(changed) { (vid, old, newOpt) => newOpt.getOrElse(old) } @@ -245,7 +262,7 @@ class GraphSuite extends FunSuite with LocalSparkContext { } Iterator((et.dstId, 1)) }, (a: Int, b: Int) => a + b, Some(changed, EdgeDirection.Out)).collect.toSet - assert(numOddNeighbors === (2 to n by 2).map(x => (x: VertexID, 1)).toSet) + assert(numOddNeighbors === (2 to n by 2).map(x => (x: VertexId, 1)).toSet) } } @@ -260,7 +277,7 @@ class GraphSuite extends FunSuite with LocalSparkContext { val neighborDegreeSums = reverseStarDegrees.mapReduceTriplets( et => Iterator((et.srcId, et.dstAttr), (et.dstId, et.srcAttr)), (a: Int, b: Int) => a + b).collect.toSet - assert(neighborDegreeSums === Set((0: VertexID, n)) ++ (1 to n).map(x => (x: VertexID, 0))) + assert(neighborDegreeSums === Set((0: VertexId, n)) ++ (1 to n).map(x => (x: VertexId, 0))) // outerJoinVertices preserving type val messages = reverseStar.vertices.mapValues { (vid, attr) => vid.toString } val newReverseStar = diff --git a/graphx/src/test/scala/org/apache/spark/graphx/LocalSparkContext.scala b/graphx/src/test/scala/org/apache/spark/graphx/LocalSparkContext.scala index aa9ba840840e0..51f02f94e00d5 100644 --- a/graphx/src/test/scala/org/apache/spark/graphx/LocalSparkContext.scala +++ b/graphx/src/test/scala/org/apache/spark/graphx/LocalSparkContext.scala @@ -1,3 +1,20 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + package org.apache.spark.graphx import org.scalatest.Suite diff --git a/graphx/src/test/scala/org/apache/spark/graphx/PregelSuite.scala b/graphx/src/test/scala/org/apache/spark/graphx/PregelSuite.scala index bceff11b8e6c4..490b94429ea1f 100644 --- a/graphx/src/test/scala/org/apache/spark/graphx/PregelSuite.scala +++ b/graphx/src/test/scala/org/apache/spark/graphx/PregelSuite.scala @@ -1,3 +1,20 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + package org.apache.spark.graphx import org.scalatest.FunSuite @@ -10,7 +27,7 @@ class PregelSuite extends FunSuite with LocalSparkContext { test("1 iteration") { withSpark { sc => val n = 5 - val starEdges = (1 to n).map(x => (0: VertexID, x: VertexID)) + val starEdges = (1 to n).map(x => (0: VertexId, x: VertexId)) val star = Graph.fromEdgeTuples(sc.parallelize(starEdges, 3), "v").cache() val result = Pregel(star, 0)( (vid, attr, msg) => attr, @@ -24,12 +41,12 @@ class PregelSuite extends FunSuite with LocalSparkContext { withSpark { sc => val n = 5 val chain = Graph.fromEdgeTuples( - sc.parallelize((1 until n).map(x => (x: VertexID, x + 1: VertexID)), 3), + sc.parallelize((1 until n).map(x => (x: VertexId, x + 1: VertexId)), 3), 0).cache() - assert(chain.vertices.collect.toSet === (1 to n).map(x => (x: VertexID, 0)).toSet) + assert(chain.vertices.collect.toSet === (1 to n).map(x => (x: VertexId, 0)).toSet) val chainWithSeed = chain.mapVertices { (vid, attr) => if (vid == 1) 1 else 0 }.cache() assert(chainWithSeed.vertices.collect.toSet === - Set((1: VertexID, 1)) ++ (2 to n).map(x => (x: VertexID, 0)).toSet) + Set((1: VertexId, 1)) ++ (2 to n).map(x => (x: VertexId, 0)).toSet) val result = Pregel(chainWithSeed, 0)( (vid, attr, msg) => math.max(msg, attr), et => if (et.dstAttr != et.srcAttr) Iterator((et.dstId, et.srcAttr)) else Iterator.empty, diff --git a/graphx/src/test/scala/org/apache/spark/graphx/SerializerSuite.scala b/graphx/src/test/scala/org/apache/spark/graphx/SerializerSuite.scala index 3ba412c1f84f4..e5a582b47ba05 100644 --- a/graphx/src/test/scala/org/apache/spark/graphx/SerializerSuite.scala +++ b/graphx/src/test/scala/org/apache/spark/graphx/SerializerSuite.scala @@ -1,3 +1,20 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
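The Pregel tests here propagate a seed value along a chain. A compact standalone version of the same pattern, assuming a local SparkContext: the initial message is 0, the vertex program keeps the maximum of attribute and message, and an edge forwards a value only while its endpoints disagree.

    import org.apache.spark.SparkContext
    import org.apache.spark.graphx._

    object PregelSketch {
      def main(args: Array[String]) {
        val sc = new SparkContext("local", "pregel-sketch")
        // Directed chain 1 -> 2 -> 3 -> 4 -> 5; only vertex 1 starts with attribute 1.
        val raw = Graph.fromEdgeTuples(
          sc.parallelize((1L until 5L).map(x => (x, x + 1))), 0)
        val chain = raw.mapVertices((vid, attr) => if (vid == 1L) 1 else 0)

        val result = Pregel(chain, 0)(
          (vid, attr, msg) => math.max(attr, msg),
          et => if (et.dstAttr != et.srcAttr) Iterator((et.dstId, et.srcAttr)) else Iterator.empty,
          (a: Int, b: Int) => math.max(a, b))
        result.vertices.collect().foreach(println)      // every vertex ends with attribute 1
        sc.stop()
      }
    }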
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + package org.apache.spark.graphx import java.io.{EOFException, ByteArrayInputStream, ByteArrayOutputStream} @@ -82,7 +99,7 @@ class SerializerSuite extends FunSuite with LocalSparkContext { test("IntAggMsgSerializer") { val conf = new SparkConf(false) - val outMsg = (4: VertexID, 5) + val outMsg = (4: VertexId, 5) val bout = new ByteArrayOutputStream val outStrm = new IntAggMsgSerializer(conf).newInstance().serializeStream(bout) outStrm.writeObject(outMsg) @@ -90,8 +107,8 @@ class SerializerSuite extends FunSuite with LocalSparkContext { bout.flush() val bin = new ByteArrayInputStream(bout.toByteArray) val inStrm = new IntAggMsgSerializer(conf).newInstance().deserializeStream(bin) - val inMsg1: (VertexID, Int) = inStrm.readObject() - val inMsg2: (VertexID, Int) = inStrm.readObject() + val inMsg1: (VertexId, Int) = inStrm.readObject() + val inMsg2: (VertexId, Int) = inStrm.readObject() assert(outMsg === inMsg1) assert(outMsg === inMsg2) @@ -102,7 +119,7 @@ class SerializerSuite extends FunSuite with LocalSparkContext { test("LongAggMsgSerializer") { val conf = new SparkConf(false) - val outMsg = (4: VertexID, 1L << 32) + val outMsg = (4: VertexId, 1L << 32) val bout = new ByteArrayOutputStream val outStrm = new LongAggMsgSerializer(conf).newInstance().serializeStream(bout) outStrm.writeObject(outMsg) @@ -110,8 +127,8 @@ class SerializerSuite extends FunSuite with LocalSparkContext { bout.flush() val bin = new ByteArrayInputStream(bout.toByteArray) val inStrm = new LongAggMsgSerializer(conf).newInstance().deserializeStream(bin) - val inMsg1: (VertexID, Long) = inStrm.readObject() - val inMsg2: (VertexID, Long) = inStrm.readObject() + val inMsg1: (VertexId, Long) = inStrm.readObject() + val inMsg2: (VertexId, Long) = inStrm.readObject() assert(outMsg === inMsg1) assert(outMsg === inMsg2) @@ -122,7 +139,7 @@ class SerializerSuite extends FunSuite with LocalSparkContext { test("DoubleAggMsgSerializer") { val conf = new SparkConf(false) - val outMsg = (4: VertexID, 5.0) + val outMsg = (4: VertexId, 5.0) val bout = new ByteArrayOutputStream val outStrm = new DoubleAggMsgSerializer(conf).newInstance().serializeStream(bout) outStrm.writeObject(outMsg) @@ -130,8 +147,8 @@ class SerializerSuite extends FunSuite with LocalSparkContext { bout.flush() val bin = new ByteArrayInputStream(bout.toByteArray) val inStrm = new DoubleAggMsgSerializer(conf).newInstance().deserializeStream(bin) - val inMsg1: (VertexID, Double) = inStrm.readObject() - val inMsg2: (VertexID, Double) = inStrm.readObject() + val inMsg1: (VertexId, Double) = inStrm.readObject() + val inMsg2: (VertexId, Double) = inStrm.readObject() assert(outMsg === inMsg1) assert(outMsg === inMsg2) diff --git a/graphx/src/test/scala/org/apache/spark/graphx/VertexRDDSuite.scala b/graphx/src/test/scala/org/apache/spark/graphx/VertexRDDSuite.scala index 
d94a3aa67c925..cc86bafd2d644 100644 --- a/graphx/src/test/scala/org/apache/spark/graphx/VertexRDDSuite.scala +++ b/graphx/src/test/scala/org/apache/spark/graphx/VertexRDDSuite.scala @@ -1,3 +1,20 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + package org.apache.spark.graphx import org.apache.spark.SparkContext diff --git a/graphx/src/test/scala/org/apache/spark/graphx/impl/EdgePartitionSuite.scala b/graphx/src/test/scala/org/apache/spark/graphx/impl/EdgePartitionSuite.scala index eb82436f0964c..e135d1d7ad6a3 100644 --- a/graphx/src/test/scala/org/apache/spark/graphx/impl/EdgePartitionSuite.scala +++ b/graphx/src/test/scala/org/apache/spark/graphx/impl/EdgePartitionSuite.scala @@ -1,3 +1,20 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + package org.apache.spark.graphx.impl import scala.reflect.ClassTag @@ -62,7 +79,7 @@ class EdgePartitionSuite extends FunSuite { test("innerJoin") { def makeEdgePartition[A: ClassTag](xs: Iterable[(Int, Int, A)]): EdgePartition[A] = { val builder = new EdgePartitionBuilder[A] - for ((src, dst, attr) <- xs) { builder.add(src: VertexID, dst: VertexID, attr) } + for ((src, dst, attr) <- xs) { builder.add(src: VertexId, dst: VertexId, attr) } builder.toEdgePartition } val aList = List((0, 1, 0), (1, 0, 0), (1, 2, 0), (5, 4, 0), (5, 5, 0)) diff --git a/graphx/src/test/scala/org/apache/spark/graphx/impl/VertexPartitionSuite.scala b/graphx/src/test/scala/org/apache/spark/graphx/impl/VertexPartitionSuite.scala index d37d64e8c849e..a048d13fd12b8 100644 --- a/graphx/src/test/scala/org/apache/spark/graphx/impl/VertexPartitionSuite.scala +++ b/graphx/src/test/scala/org/apache/spark/graphx/impl/VertexPartitionSuite.scala @@ -1,3 +1,20 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
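The `innerJoin` test above relies on a small helper that builds an `EdgePartition` directly from `(src, dst, attr)` triples. Restated as a standalone snippet for reference; note that `EdgePartition` and `EdgePartitionBuilder` are GraphX internals (`private[graphx]`), so this only compiles inside the GraphX source tree, exactly as the suite does:

```scala
import scala.reflect.ClassTag
import org.apache.spark.graphx._
import org.apache.spark.graphx.impl.{EdgePartition, EdgePartitionBuilder}

// Build a columnar EdgePartition from in-memory triples, as the suite's helper does.
def makeEdgePartition[A: ClassTag](xs: Iterable[(Int, Int, A)]): EdgePartition[A] = {
  val builder = new EdgePartitionBuilder[A]
  for ((src, dst, attr) <- xs) { builder.add(src: VertexId, dst: VertexId, attr) }
  builder.toEdgePartition
}

val part = makeEdgePartition(List((0, 1, "a"), (1, 0, "b"), (1, 2, "c")))
assert(part.iterator.size == 3)
```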
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + package org.apache.spark.graphx.impl import org.apache.spark.graphx._ diff --git a/graphx/src/test/scala/org/apache/spark/graphx/lib/ConnectedComponentsSuite.scala b/graphx/src/test/scala/org/apache/spark/graphx/lib/ConnectedComponentsSuite.scala index 27c8705bca2ff..3915be15b3434 100644 --- a/graphx/src/test/scala/org/apache/spark/graphx/lib/ConnectedComponentsSuite.scala +++ b/graphx/src/test/scala/org/apache/spark/graphx/lib/ConnectedComponentsSuite.scala @@ -1,3 +1,20 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + package org.apache.spark.graphx.lib import org.scalatest.FunSuite @@ -83,7 +100,7 @@ class ConnectedComponentsSuite extends FunSuite with LocalSparkContext { test("Connected Components on a Toy Connected Graph") { withSpark { sc => // Create an RDD for the vertices - val users: RDD[(VertexID, (String, String))] = + val users: RDD[(VertexId, (String, String))] = sc.parallelize(Array((3L, ("rxin", "student")), (7L, ("jgonzal", "postdoc")), (5L, ("franklin", "prof")), (2L, ("istoica", "prof")), (4L, ("peter", "student")))) diff --git a/graphx/src/test/scala/org/apache/spark/graphx/lib/PageRankSuite.scala b/graphx/src/test/scala/org/apache/spark/graphx/lib/PageRankSuite.scala index fe7e4261f8d03..fc491ae327c2a 100644 --- a/graphx/src/test/scala/org/apache/spark/graphx/lib/PageRankSuite.scala +++ b/graphx/src/test/scala/org/apache/spark/graphx/lib/PageRankSuite.scala @@ -1,3 +1,20 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
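The ConnectedComponentsSuite hunk above sets up a small property graph of users. For context, the end-to-end pattern the suite exercises looks roughly like this; it assumes an existing `SparkContext` named `sc`, and the vertex and edge data are just illustrative:

```scala
import org.apache.spark.graphx._
import org.apache.spark.rdd.RDD

// Vertices carry (username, role); edges carry a relationship label.
val users: RDD[(VertexId, (String, String))] =
  sc.parallelize(Array(
    (3L, ("rxin", "student")), (7L, ("jgonzal", "postdoc")),
    (5L, ("franklin", "prof")), (2L, ("istoica", "prof")),
    (4L, ("peter", "student"))))
val relationships: RDD[Edge[String]] =
  sc.parallelize(Array(
    Edge(3L, 7L, "collab"), Edge(5L, 3L, "advisor"),
    Edge(2L, 5L, "colleague"), Edge(5L, 7L, "pi")))
val defaultUser = ("John Doe", "Missing")
val graph = Graph(users, relationships, defaultUser)

// Each vertex ends up labeled with the lowest VertexId in its connected component;
// the isolated vertex 4 forms a component of its own.
val cc = graph.connectedComponents().vertices
cc.collect().foreach(println)
```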
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + package org.apache.spark.graphx.lib import org.scalatest.FunSuite diff --git a/graphx/src/test/scala/org/apache/spark/graphx/lib/SVDPlusPlusSuite.scala b/graphx/src/test/scala/org/apache/spark/graphx/lib/SVDPlusPlusSuite.scala index e173c652a53b6..e01df56e94de9 100644 --- a/graphx/src/test/scala/org/apache/spark/graphx/lib/SVDPlusPlusSuite.scala +++ b/graphx/src/test/scala/org/apache/spark/graphx/lib/SVDPlusPlusSuite.scala @@ -1,12 +1,25 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + package org.apache.spark.graphx.lib import org.scalatest.FunSuite -import org.apache.spark.SparkContext -import org.apache.spark.SparkContext._ import org.apache.spark.graphx._ -import org.apache.spark.graphx.util.GraphGenerators -import org.apache.spark.rdd._ class SVDPlusPlusSuite extends FunSuite with LocalSparkContext { @@ -14,16 +27,16 @@ class SVDPlusPlusSuite extends FunSuite with LocalSparkContext { test("Test SVD++ with mean square error on training set") { withSpark { sc => val svdppErr = 8.0 - val edges = sc.textFile("mllib/data/als/test.data").map { line => + val edges = sc.textFile(getClass.getResource("/als-test.data").getFile).map { line => val fields = line.split(",") Edge(fields(0).toLong * 2, fields(1).toLong * 2 + 1, fields(2).toDouble) } val conf = new SVDPlusPlus.Conf(10, 2, 0.0, 5.0, 0.007, 0.007, 0.005, 0.015) // 2 iterations var (graph, u) = SVDPlusPlus.run(edges, conf) graph.cache() - val err = graph.vertices.collect.map{ case (vid, vd) => + val err = graph.vertices.collect().map{ case (vid, vd) => if (vid % 2 == 1) vd._4 else 0.0 - }.reduce(_ + _) / graph.triplets.collect.size + }.reduce(_ + _) / graph.triplets.collect().size assert(err <= svdppErr) } } diff --git a/graphx/src/test/scala/org/apache/spark/graphx/lib/StronglyConnectedComponentsSuite.scala b/graphx/src/test/scala/org/apache/spark/graphx/lib/StronglyConnectedComponentsSuite.scala index 0458311661452..df54aa37cad68 100644 --- a/graphx/src/test/scala/org/apache/spark/graphx/lib/StronglyConnectedComponentsSuite.scala +++ b/graphx/src/test/scala/org/apache/spark/graphx/lib/StronglyConnectedComponentsSuite.scala @@ -1,3 +1,20 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
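The SVDPlusPlusSuite hunk mainly moves the ratings file into test resources and normalizes `collect` to `collect()`, but it also shows the full SVD++ call sequence. A sketch of the same flow, assuming an existing `SparkContext` named `sc` and a placeholder path to a `user,item,rating` CSV file:

```scala
import org.apache.spark.graphx._
import org.apache.spark.graphx.lib.SVDPlusPlus

// Map users to even vertex ids and items to odd ids, as the suite does.
val edges = sc.textFile("data/als-test.data").map { line =>  // placeholder path
  val fields = line.split(",")
  Edge(fields(0).toLong * 2, fields(1).toLong * 2 + 1, fields(2).toDouble)
}

// rank 10, 2 iterations, plus the regularization/learning-rate parameters from the suite.
val conf = new SVDPlusPlus.Conf(10, 2, 0.0, 5.0, 0.007, 0.007, 0.005, 0.015)
val (graph, u) = SVDPlusPlus.run(edges, conf)
graph.cache()

// The suite reads the 4th element of each odd-id (item) vertex attribute as its error term.
val err = graph.vertices.collect().map { case (vid, vd) =>
  if (vid % 2 == 1) vd._4 else 0.0
}.reduce(_ + _) / graph.triplets.collect().size
println("training error: " + err)
```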
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + package org.apache.spark.graphx.lib import org.scalatest.FunSuite diff --git a/graphx/src/test/scala/org/apache/spark/graphx/lib/TriangleCountSuite.scala b/graphx/src/test/scala/org/apache/spark/graphx/lib/TriangleCountSuite.scala index 3452ce9764752..293c7f3ba4c21 100644 --- a/graphx/src/test/scala/org/apache/spark/graphx/lib/TriangleCountSuite.scala +++ b/graphx/src/test/scala/org/apache/spark/graphx/lib/TriangleCountSuite.scala @@ -1,3 +1,20 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + package org.apache.spark.graphx.lib import org.scalatest.FunSuite diff --git a/graphx/src/test/scala/org/apache/spark/graphx/util/BytecodeUtilsSuite.scala b/graphx/src/test/scala/org/apache/spark/graphx/util/BytecodeUtilsSuite.scala index 11db339750920..f3b3738db0dad 100644 --- a/graphx/src/test/scala/org/apache/spark/graphx/util/BytecodeUtilsSuite.scala +++ b/graphx/src/test/scala/org/apache/spark/graphx/util/BytecodeUtilsSuite.scala @@ -1,3 +1,20 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + package org.apache.spark.graphx.util import org.scalatest.FunSuite diff --git a/mllib/pom.xml b/mllib/pom.xml index dda3900afebdf..41600c4c4b561 100644 --- a/mllib/pom.xml +++ b/mllib/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent - 0.9.0-incubating-SNAPSHOT + 0.9.1-incubating-SNAPSHOT ../pom.xml diff --git a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala index 3fec1a909dfb9..efc0eb935376b 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala @@ -24,7 +24,6 @@ import org.apache.spark.mllib.recommendation._ import org.apache.spark.rdd.RDD import java.nio.ByteBuffer import java.nio.ByteOrder -import java.nio.DoubleBuffer /** * The Java stubs necessary for the Python mllib bindings. @@ -81,7 +80,6 @@ class PythonMLLibAPI extends Serializable { } val db = bb.asDoubleBuffer() val ans = new Array[Array[Double]](rows.toInt) - var i = 0 for (i <- 0 until rows.toInt) { ans(i) = new Array[Double](cols.toInt) db.get(ans(i)) @@ -236,7 +234,7 @@ class PythonMLLibAPI extends Serializable { * Serialize a Rating object into an array of bytes. * It can be deserialized using RatingDeserializer(). * - * @param rate + * @param rate the Rating object to serialize * @return */ private[spark] def serializeRating(rate: Rating): Array[Byte] = { diff --git a/mllib/src/main/scala/org/apache/spark/mllib/classification/SVM.scala b/mllib/src/main/scala/org/apache/spark/mllib/classification/SVM.scala index f2964ea446ec8..6dff29dfb45cc 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/classification/SVM.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/classification/SVM.scala @@ -17,8 +17,6 @@ package org.apache.spark.mllib.classification -import scala.math.signum - import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import org.apache.spark.mllib.optimization._ diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeansModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeansModel.scala index cfc81c985aa64..980be931576dc 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeansModel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeansModel.scala @@ -19,8 +19,6 @@ package org.apache.spark.mllib.clustering import org.apache.spark.rdd.RDD import org.apache.spark.SparkContext._ -import org.apache.spark.mllib.util.MLUtils - /** * A clustering model for K-means. Each point belongs to the cluster with the closest center. @@ -39,6 +37,6 @@ class KMeansModel(val clusterCenters: Array[Array[Double]]) extends Serializable * model on the given data. 
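Alongside the version bump and import cleanups, the KMeansModel hunk normalizes `sum` to `sum()` in `computeCost`, which adds up the cost of each point against its nearest center. A quick usage sketch, assuming an existing `SparkContext` named `sc` (0.9-era MLlib represents feature vectors as `Array[Double]`):

```scala
import org.apache.spark.mllib.clustering.KMeans

// Two well-separated 1-D clusters.
val points = sc.parallelize(Seq(
  Array(0.0), Array(0.1), Array(0.2),
  Array(9.0), Array(9.1), Array(9.2)))

val model = KMeans.train(points, 2, 10)  // k = 2, maxIterations = 10

// computeCost sums KMeans.pointCost over the data, i.e. the within-set
// sum of squared distances to the nearest cluster center.
val wssse = model.computeCost(points)
println("WSSSE: " + wssse)
```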
*/ def computeCost(data: RDD[Array[Double]]): Double = { - data.map(p => KMeans.pointCost(clusterCenters, p)).sum + data.map(p => KMeans.pointCost(clusterCenters, p)).sum() } } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/optimization/Updater.scala b/mllib/src/main/scala/org/apache/spark/mllib/optimization/Updater.scala index 4c51f4f881f76..37124f261eeb9 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/optimization/Updater.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/optimization/Updater.scala @@ -86,13 +86,17 @@ class L1Updater extends Updater { /** * Updater that adjusts the learning rate and performs L2 regularization + * + * See, for example, explanation of gradient and loss with L2 regularization on slide 21-22 + * of + * these slides. */ class SquaredL2Updater extends Updater { override def compute(weightsOld: DoubleMatrix, gradient: DoubleMatrix, stepSize: Double, iter: Int, regParam: Double): (DoubleMatrix, Double) = { val thisIterStepSize = stepSize / math.sqrt(iter) val normGradient = gradient.mul(thisIterStepSize) - val newWeights = weightsOld.sub(normGradient).div(2.0 * thisIterStepSize * regParam + 1.0) + val newWeights = weightsOld.mul(1.0 - 2.0 * thisIterStepSize * regParam).sub(normGradient) (newWeights, pow(newWeights.norm2, 2.0) * regParam) } } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/recommendation/ALS.scala b/mllib/src/main/scala/org/apache/spark/mllib/recommendation/ALS.scala index 89ee07063dd89..3e93402adffaf 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/recommendation/ALS.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/recommendation/ALS.scala @@ -209,8 +209,8 @@ class ALS private (var numBlocks: Int, var rank: Int, var iterations: Int, var l def computeYtY(factors: RDD[(Int, Array[Array[Double]])]) = { if (implicitPrefs) { Option( - factors.flatMapValues{ case factorArray => - factorArray.map{ vector => + factors.flatMapValues { case factorArray => + factorArray.view.map { vector => val x = new DoubleMatrix(vector) x.mmul(x.transpose()) } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/regression/LinearRegression.scala b/mllib/src/main/scala/org/apache/spark/mllib/regression/LinearRegression.scala index fe5cce064bac7..df599fde76a86 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/regression/LinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/regression/LinearRegression.scala @@ -17,7 +17,7 @@ package org.apache.spark.mllib.regression -import org.apache.spark.{Logging, SparkContext} +import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import org.apache.spark.mllib.optimization._ import org.apache.spark.mllib.util.MLUtils diff --git a/mllib/src/main/scala/org/apache/spark/mllib/regression/RidgeRegression.scala b/mllib/src/main/scala/org/apache/spark/mllib/regression/RidgeRegression.scala index c125c6797ada3..0c0e67fb7b123 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/regression/RidgeRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/regression/RidgeRegression.scala @@ -17,7 +17,7 @@ package org.apache.spark.mllib.regression -import org.apache.spark.{Logging, SparkContext} +import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import org.apache.spark.mllib.optimization._ import org.apache.spark.mllib.util.MLUtils @@ -76,7 +76,7 @@ class RidgeRegressionWithSGD private ( def createModel(weights: Array[Double], intercept: Double) = { val weightsMat = new DoubleMatrix(weights.length + 1, 1, (Array(intercept) ++ 
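The `SquaredL2Updater` hunk above is the substantive fix in this block: the old code subtracted the scaled gradient and then divided the whole vector by `2ηλ + 1`, whereas the canonical L2-regularized SGD step first shrinks the weights by `(1 − 2ηλ)` and then subtracts the scaled gradient, with `η = stepSize / sqrt(iter)`. The corrected step, restated as a standalone function over jblas matrices exactly as the patch writes it:

```scala
import org.jblas.DoubleMatrix
import scala.math.pow

// Canonical L2-regularized SGD step, as in the patched SquaredL2Updater:
//   w' = w * (1 - 2 * eta * regParam) - eta * gradient
def squaredL2Step(
    weightsOld: DoubleMatrix,
    gradient: DoubleMatrix,
    stepSize: Double,
    iter: Int,
    regParam: Double): (DoubleMatrix, Double) = {
  val thisIterStepSize = stepSize / math.sqrt(iter)
  val normGradient = gradient.mul(thisIterStepSize)
  val newWeights = weightsOld.mul(1.0 - 2.0 * thisIterStepSize * regParam).sub(normGradient)
  // The second element is the regularization value computed for the new weights.
  (newWeights, pow(newWeights.norm2, 2.0) * regParam)
}
```

The multiplicative shrinkage term is what makes this the standard "weight decay" form of L2 regularization.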
weights):_*) val weightsScaled = weightsMat.div(xColSd) - val interceptScaled = yMean - (weightsMat.transpose().mmul(xColMean.div(xColSd)).get(0)) + val interceptScaled = yMean - weightsMat.transpose().mmul(xColMean.div(xColSd)).get(0) new RidgeRegressionModel(weightsScaled.data, interceptScaled) } @@ -86,7 +86,7 @@ class RidgeRegressionWithSGD private ( initialWeights: Array[Double]) : RidgeRegressionModel = { - val nfeatures: Int = input.first.features.length + val nfeatures: Int = input.first().features.length val nexamples: Long = input.count() // To avoid penalizing the intercept, we center and scale the data. diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/LinearDataGenerator.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/LinearDataGenerator.scala index bc5045fb05d3b..2e03684e62861 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/util/LinearDataGenerator.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/util/LinearDataGenerator.scala @@ -25,7 +25,6 @@ import org.jblas.DoubleMatrix import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import org.apache.spark.mllib.regression.LabeledPoint -import org.apache.spark.mllib.regression.LabeledPoint /** * Generate sample data used for Linear Data. This class generates @@ -73,7 +72,7 @@ object LinearDataGenerator { val x = Array.fill[Array[Double]](nPoints)( Array.fill[Double](weights.length)(2 * rnd.nextDouble - 1.0)) val y = x.map { xi => - (new DoubleMatrix(1, xi.length, xi:_*)).dot(weightsMat) + intercept + eps * rnd.nextGaussian() + new DoubleMatrix(1, xi.length, xi: _*).dot(weightsMat) + intercept + eps * rnd.nextGaussian() } y.zip(x).map(p => LabeledPoint(p._1, p._2)) } @@ -86,7 +85,6 @@ object LinearDataGenerator { * @param nexamples Number of examples that will be contained in the RDD. * @param nfeatures Number of features to generate for each example. * @param eps Epsilon factor by which examples are scaled. - * @param weights Weights associated with the first weights.length features. * @param nparts Number of partitions in the RDD. Default value is 2. * * @return RDD of LabeledPoint containing sample data. diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/MFDataGenerator.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/MFDataGenerator.scala index d5f3f6b8dbeea..348aba1dea5b6 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/util/MFDataGenerator.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/util/MFDataGenerator.scala @@ -15,7 +15,7 @@ * limitations under the License. */ -package org.apache.spark.mllib.recommendation +package org.apache.spark.mllib.util import scala.util.Random @@ -23,7 +23,6 @@ import org.jblas.DoubleMatrix import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD -import org.apache.spark.mllib.util.MLUtils /** * Generate RDD(s) containing data for Matrix Factorization. @@ -31,9 +30,9 @@ import org.apache.spark.mllib.util.MLUtils * This method samples training entries according to the oversampling factor * 'trainSampFact', which is a multiplicative factor of the number of * degrees of freedom of the matrix: rank*(m+n-rank). -* -* It optionally samples entries for a testing matrix using -* 'testSampFact', the percentage of the number of training entries +* +* It optionally samples entries for a testing matrix using +* 'testSampFact', the percentage of the number of training entries * to use for testing. 
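The `LinearDataGenerator` hunk only tidies syntax, but the recipe it touches (label = x · w + intercept + Gaussian noise) is handy to have in runnable form for local experiments. A self-contained restatement under the 0.9 API, where `LabeledPoint` takes an `Array[Double]`; the function name and defaults here are illustrative:

```scala
import scala.util.Random
import org.jblas.DoubleMatrix
import org.apache.spark.mllib.regression.LabeledPoint

// Draw nPoints examples with features in [-1, 1] and label x.w + intercept + noise,
// the same recipe the patched generator uses.
def generateLinearInput(
    intercept: Double,
    weights: Array[Double],
    nPoints: Int,
    seed: Int,
    eps: Double = 0.1): Seq[LabeledPoint] = {
  val rnd = new Random(seed)
  val weightsMat = new DoubleMatrix(1, weights.length, weights: _*)
  val x = Array.fill(nPoints)(Array.fill(weights.length)(2 * rnd.nextDouble - 1.0))
  val y = x.map { xi =>
    new DoubleMatrix(1, xi.length, xi: _*).dot(weightsMat) + intercept + eps * rnd.nextGaussian()
  }
  y.zip(x).map(p => LabeledPoint(p._1, p._2)).toSeq
}
```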
* * This method takes the following inputs: @@ -73,7 +72,7 @@ object MFDataGenerator{ val A = DoubleMatrix.randn(m, rank) val B = DoubleMatrix.randn(rank, n) - val z = 1 / (scala.math.sqrt(scala.math.sqrt(rank))) + val z = 1 / scala.math.sqrt(scala.math.sqrt(rank)) A.mmuli(z) B.mmuli(z) val fullData = A.mmul(B) @@ -91,7 +90,7 @@ object MFDataGenerator{ .map(x => (fullData.indexRows(x - 1), fullData.indexColumns(x - 1), fullData.get(x - 1))) // optionally add gaussian noise - if (noise) { + if (noise) { trainData.map(x => (x._1, x._2, x._3 + rand.nextGaussian * sigma)) } @@ -107,8 +106,8 @@ object MFDataGenerator{ .map(x => (fullData.indexRows(x - 1), fullData.indexColumns(x - 1), fullData.get(x - 1))) testData.map(x => x._1 + "," + x._2 + "," + x._3).saveAsTextFile(outputPath) } - + sc.stop() - + } } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala index d91b74c3ac2b3..64c6136a8b89d 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala @@ -97,7 +97,7 @@ object MLUtils { while (col < nfeatures) { xColMean.put(col, xColSumsMap(col)._1 / nexamples) val variance = - (xColSumsMap(col)._2 - (math.pow(xColSumsMap(col)._1, 2) / nexamples)) / (nexamples) + (xColSumsMap(col)._2 - (math.pow(xColSumsMap(col)._1, 2) / nexamples)) / nexamples xColSd.put(col, math.sqrt(variance)) col += 1 } diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/SVMDataGenerator.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/SVMDataGenerator.scala index 07022093f300c..c96c94f70eef7 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/util/SVMDataGenerator.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/util/SVMDataGenerator.scala @@ -56,7 +56,7 @@ object SVMDataGenerator { val x = Array.fill[Double](nfeatures) { rnd.nextDouble() * 2.0 - 1.0 } - val yD = (new DoubleMatrix(1, x.length, x:_*)).dot(trueWeights) + rnd.nextGaussian() * 0.1 + val yD = new DoubleMatrix(1, x.length, x: _*).dot(trueWeights) + rnd.nextGaussian() * 0.1 val y = if (yD < 0) 0.0 else 1.0 LabeledPoint(y, x) } diff --git a/mllib/src/test/java/org/apache/spark/mllib/classification/JavaNaiveBayesSuite.java b/mllib/src/test/java/org/apache/spark/mllib/classification/JavaNaiveBayesSuite.java index 23ea3548b95b6..073ded6f36933 100644 --- a/mllib/src/test/java/org/apache/spark/mllib/classification/JavaNaiveBayesSuite.java +++ b/mllib/src/test/java/org/apache/spark/mllib/classification/JavaNaiveBayesSuite.java @@ -1,3 +1,20 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
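The MLUtils hunk just drops redundant parentheses around the column-variance expression, but the identity it encodes is easy to misread: the population variance is computed from running sums as `var = (sumSq - sum^2 / n) / n`. A quick standalone check of that identity:

```scala
// Column variance from running sums, as in MLUtils.computeStats:
//   variance = (sumSq - sum^2 / n) / n
val col = Array(1.0, 2.0, 3.0, 4.0)
val n = col.length.toDouble
val sum = col.sum
val sumSq = col.map(x => x * x).sum
val mean = sum / n
val variance = (sumSq - math.pow(sum, 2) / n) / n

// Cross-check against the direct definition of population variance.
val direct = col.map(x => math.pow(x - mean, 2)).sum / n
assert(math.abs(variance - direct) < 1e-12)
println("mean = " + mean + ", variance = " + variance)  // 2.5, 1.25
```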
+ */ + package org.apache.spark.mllib.classification; import org.apache.spark.api.java.JavaRDD; diff --git a/mllib/src/test/scala/org/apache/spark/mllib/classification/LogisticRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/classification/LogisticRegressionSuite.scala index 34c67294e9ac9..02ede711372d3 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/classification/LogisticRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/classification/LogisticRegressionSuite.scala @@ -80,9 +80,9 @@ class LogisticRegressionSuite extends FunSuite with BeforeAndAfterAll with Shoul } def validatePrediction(predictions: Seq[Double], input: Seq[LabeledPoint]) { - val numOffPredictions = predictions.zip(input).filter { case (prediction, expected) => - (prediction != expected.label) - }.size + val numOffPredictions = predictions.zip(input).count { case (prediction, expected) => + prediction != expected.label + } // At least 83% of the predictions should be on. ((input.length - numOffPredictions).toDouble / input.length) should be > 0.83 } diff --git a/mllib/src/test/scala/org/apache/spark/mllib/classification/SVMSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/classification/SVMSuite.scala index 6a957e3ddca71..3357b86f9b706 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/classification/SVMSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/classification/SVMSuite.scala @@ -18,7 +18,6 @@ package org.apache.spark.mllib.classification import scala.util.Random -import scala.math.signum import scala.collection.JavaConversions._ import org.scalatest.BeforeAndAfterAll @@ -50,7 +49,7 @@ object SVMSuite { val x = Array.fill[Array[Double]](nPoints)( Array.fill[Double](weights.length)(rnd.nextDouble() * 2.0 - 1.0)) val y = x.map { xi => - val yD = (new DoubleMatrix(1, xi.length, xi:_*)).dot(weightsMat) + + val yD = new DoubleMatrix(1, xi.length, xi: _*).dot(weightsMat) + intercept + 0.01 * rnd.nextGaussian() if (yD < 0) 0.0 else 1.0 } @@ -72,9 +71,9 @@ class SVMSuite extends FunSuite with BeforeAndAfterAll { } def validatePrediction(predictions: Seq[Double], input: Seq[LabeledPoint]) { - val numOffPredictions = predictions.zip(input).filter { case (prediction, expected) => - (prediction != expected.label) - }.size + val numOffPredictions = predictions.zip(input).count { case (prediction, expected) => + prediction != expected.label + } // At least 80% of the predictions should be on. 
assert(numOffPredictions < input.length / 5) } diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/KMeansSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/KMeansSuite.scala index 94245f6027b30..73657cac893ce 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/KMeansSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/KMeansSuite.scala @@ -17,15 +17,12 @@ package org.apache.spark.mllib.clustering -import scala.util.Random import org.scalatest.BeforeAndAfterAll import org.scalatest.FunSuite import org.apache.spark.SparkContext -import org.apache.spark.SparkContext._ -import org.jblas._ class KMeansSuite extends FunSuite with BeforeAndAfterAll { @transient private var sc: SparkContext = _ diff --git a/mllib/src/test/scala/org/apache/spark/mllib/recommendation/ALSSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/recommendation/ALSSuite.scala index e683a90f57aba..4e8dbde65801c 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/recommendation/ALSSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/recommendation/ALSSuite.scala @@ -24,7 +24,6 @@ import org.scalatest.BeforeAndAfterAll import org.scalatest.FunSuite import org.apache.spark.SparkContext -import org.apache.spark.SparkContext._ import org.jblas._ diff --git a/mllib/src/test/scala/org/apache/spark/mllib/regression/LassoSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/regression/LassoSuite.scala index db980c7bae64f..b2c8df97a82a7 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/regression/LassoSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/regression/LassoSuite.scala @@ -17,8 +17,6 @@ package org.apache.spark.mllib.regression -import scala.collection.JavaConversions._ -import scala.util.Random import org.scalatest.BeforeAndAfterAll import org.scalatest.FunSuite @@ -41,10 +39,10 @@ class LassoSuite extends FunSuite with BeforeAndAfterAll { } def validatePrediction(predictions: Seq[Double], input: Seq[LabeledPoint]) { - val numOffPredictions = predictions.zip(input).filter { case (prediction, expected) => + val numOffPredictions = predictions.zip(input).count { case (prediction, expected) => // A prediction is off if the prediction is more than 0.5 away from expected value. math.abs(prediction - expected.label) > 0.5 - }.size + } // At least 80% of the predictions should be on. 
assert(numOffPredictions < input.length / 5) } diff --git a/mllib/src/test/scala/org/apache/spark/mllib/regression/LinearRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/regression/LinearRegressionSuite.scala index ef500c704c8a9..406afbaa3e2c1 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/regression/LinearRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/regression/LinearRegressionSuite.scala @@ -21,7 +21,6 @@ import org.scalatest.BeforeAndAfterAll import org.scalatest.FunSuite import org.apache.spark.SparkContext -import org.apache.spark.SparkContext._ import org.apache.spark.mllib.util.LinearDataGenerator class LinearRegressionSuite extends FunSuite with BeforeAndAfterAll { @@ -37,10 +36,10 @@ class LinearRegressionSuite extends FunSuite with BeforeAndAfterAll { } def validatePrediction(predictions: Seq[Double], input: Seq[LabeledPoint]) { - val numOffPredictions = predictions.zip(input).filter { case (prediction, expected) => + val numOffPredictions = predictions.zip(input).count { case (prediction, expected) => // A prediction is off if the prediction is more than 0.5 away from expected value. math.abs(prediction - expected.label) > 0.5 - }.size + } // At least 80% of the predictions should be on. assert(numOffPredictions < input.length / 5) } diff --git a/mllib/src/test/scala/org/apache/spark/mllib/regression/RidgeRegressionSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/regression/RidgeRegressionSuite.scala index c18092d804fa3..1d6a10b66e892 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/regression/RidgeRegressionSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/regression/RidgeRegressionSuite.scala @@ -17,15 +17,12 @@ package org.apache.spark.mllib.regression -import scala.collection.JavaConversions._ -import scala.util.Random import org.jblas.DoubleMatrix import org.scalatest.BeforeAndAfterAll import org.scalatest.FunSuite import org.apache.spark.SparkContext -import org.apache.spark.SparkContext._ import org.apache.spark.mllib.util.LinearDataGenerator class RidgeRegressionSuite extends FunSuite with BeforeAndAfterAll { diff --git a/pom.xml b/pom.xml index b25d9d7ef891d..e53c930ad0aa4 100644 --- a/pom.xml +++ b/pom.xml @@ -25,7 +25,7 @@ org.apache.spark spark-parent - 0.9.0-incubating-SNAPSHOT + 0.9.1-incubating-SNAPSHOT pom Spark Project Parent POM http://spark.incubator.apache.org/ @@ -389,7 +389,7 @@ com.novocode junit-interface - 0.9 + 0.10 test @@ -727,6 +727,14 @@ + + + release + + true + + yarn-alpha @@ -735,11 +743,9 @@ 0.23.7 - yarn - @@ -752,17 +758,15 @@ yarn - - + + - repl-bin - - false - + spark-ganglia-lgpl - repl-bin + extras/spark-ganglia-lgpl + diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala index a9f9937cb168c..f9eeeb0e2821d 100644 --- a/project/SparkBuild.scala +++ b/project/SparkBuild.scala @@ -61,10 +61,16 @@ object SparkBuild extends Build { lazy val mllib = Project("mllib", file("mllib"), settings = mllibSettings) dependsOn(core) lazy val assemblyProj = Project("assembly", file("assembly"), settings = assemblyProjSettings) - .dependsOn(core, graphx, bagel, mllib, repl, streaming) dependsOn(maybeYarn: _*) + .dependsOn(core, graphx, bagel, mllib, repl, streaming) dependsOn(maybeYarn: _*) dependsOn(maybeGanglia: _*) lazy val assembleDeps = TaskKey[Unit]("assemble-deps", "Build assembly of dependencies and packages Spark projects") + // A dummy command so we can run the Jenkins pull request builder for older versions of Spark. 
+ lazy val scalastyle = TaskKey[Unit]("scalastyle", "Dummy scalastyle check") + val scalastyleTask = scalastyle := { + println("scalastyle is not configured for this version of Spark project.") + } + // A configuration to set an alternative publishLocalConfiguration lazy val MavenCompile = config("m2r") extend(Compile) lazy val publishLocalBoth = TaskKey[Unit]("publish-local", "publish local for m2 and ivy") @@ -84,13 +90,21 @@ object SparkBuild extends Build { case None => DEFAULT_YARN case Some(v) => v.toBoolean } - - // Conditionally include the yarn sub-project + lazy val hadoopClient = if (hadoopVersion.startsWith("0.20.") || hadoopVersion == "1.0.0") "hadoop-core" else "hadoop-client" + + // Include Ganglia integration if the user has enabled Ganglia + // This is isolated from the normal build due to LGPL-licensed code in the library + lazy val isGangliaEnabled = Properties.envOrNone("SPARK_GANGLIA_LGPL").isDefined + lazy val gangliaProj = Project("spark-ganglia-lgpl", file("extras/spark-ganglia-lgpl"), settings = gangliaSettings).dependsOn(core) + val maybeGanglia: Seq[ClasspathDependency] = if (isGangliaEnabled) Seq(gangliaProj) else Seq() + val maybeGangliaRef: Seq[ProjectReference] = if (isGangliaEnabled) Seq(gangliaProj) else Seq() + + // Include the YARN project if the user has enabled YARN lazy val yarnAlpha = Project("yarn-alpha", file("yarn/alpha"), settings = yarnAlphaSettings) dependsOn(core) lazy val yarn = Project("yarn", file("yarn/stable"), settings = yarnSettings) dependsOn(core) - lazy val maybeYarn = if (isYarnEnabled) Seq[ClasspathDependency](if (isNewHadoop) yarn else yarnAlpha) else Seq[ClasspathDependency]() - lazy val maybeYarnRef = if (isYarnEnabled) Seq[ProjectReference](if (isNewHadoop) yarn else yarnAlpha) else Seq[ProjectReference]() + lazy val maybeYarn: Seq[ClasspathDependency] = if (isYarnEnabled) Seq(if (isNewHadoop) yarn else yarnAlpha) else Seq() + lazy val maybeYarnRef: Seq[ProjectReference] = if (isYarnEnabled) Seq(if (isNewHadoop) yarn else yarnAlpha) else Seq() lazy val externalTwitter = Project("external-twitter", file("external/twitter"), settings = twitterSettings) .dependsOn(streaming % "compile->compile;test->test") @@ -114,13 +128,13 @@ object SparkBuild extends Build { .dependsOn(core, mllib, graphx, bagel, streaming, externalTwitter) dependsOn(allExternal: _*) // Everything except assembly, tools and examples belong to packageProjects - lazy val packageProjects = Seq[ProjectReference](core, repl, bagel, streaming, mllib, graphx) ++ maybeYarnRef + lazy val packageProjects = Seq[ProjectReference](core, repl, bagel, streaming, mllib, graphx) ++ maybeYarnRef ++ maybeGangliaRef lazy val allProjects = packageProjects ++ allExternalRefs ++ Seq[ProjectReference](examples, tools, assemblyProj) def sharedSettings = Defaults.defaultSettings ++ Seq( organization := "org.apache.spark", - version := "0.9.0-incubating-SNAPSHOT", + version := "0.9.0-incubating", scalaVersion := "2.10.3", scalacOptions := Seq("-Xmax-classfile-name", "120", "-unchecked", "-deprecation", "-target:" + SCALAC_JVM_VERSION), @@ -212,12 +226,13 @@ object SparkBuild extends Build { "org.eclipse.jetty.orbit" % "javax.servlet" % "2.5.0.v201103041518" artifacts Artifact("javax.servlet", "jar", "jar"), "org.scalatest" %% "scalatest" % "1.9.1" % "test", "org.scalacheck" %% "scalacheck" % "1.10.0" % "test", - "com.novocode" % "junit-interface" % "0.9" % "test", + "com.novocode" % "junit-interface" % "0.10" % "test", "org.easymock" % "easymock" % "3.1" % "test", "org.mockito" % 
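The SparkBuild changes above introduce two build patterns: a dummy `scalastyle` task so the Jenkins pull-request builder can run against this older branch, and an optional `spark-ganglia-lgpl` sub-project that is only wired into the build when an environment variable is set, keeping LGPL-licensed code out of the default assembly. A trimmed sketch of the environment-gated sub-project with hypothetical project names, assuming the same sbt 0.12.x `Build` API that `SparkBuild.scala` targets:

```scala
import sbt._
import scala.util.Properties

object ExampleBuild extends Build {
  lazy val core = Project("core", file("core"))

  // Only include the optional module when SPARK_GANGLIA_LGPL is set in the environment.
  lazy val isGangliaEnabled = Properties.envOrNone("SPARK_GANGLIA_LGPL").isDefined
  lazy val gangliaProj = Project("spark-ganglia-lgpl", file("extras/spark-ganglia-lgpl"))
    .dependsOn(core)
  lazy val maybeGanglia: Seq[ClasspathDependency] =
    if (isGangliaEnabled) Seq(gangliaProj) else Seq()
  lazy val maybeGangliaRef: Seq[ProjectReference] =
    if (isGangliaEnabled) Seq(gangliaProj) else Seq()

  // Projects to package: the optional module is only present when enabled.
  lazy val packageProjects = Seq[ProjectReference](core) ++ maybeGangliaRef

  // The assembly only depends on the module when the flag is set.
  lazy val assembly = Project("assembly", file("assembly"))
    .dependsOn(core)
    .dependsOn(maybeGanglia: _*)
}
```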
"mockito-all" % "1.8.5" % "test", "commons-io" % "commons-io" % "2.4" % "test" ), + testOptions += Tests.Argument(TestFrameworks.JUnit, "-v", "-a"), parallelExecution := true, /* Workaround for issue #206 (fixed after SBT 0.11.0) */ watchTransitiveSources <<= Defaults.inDependencies[Task[Seq[File]]](watchSources.task, @@ -266,18 +281,16 @@ object SparkBuild extends Build { "org.apache.mesos" % "mesos" % "0.13.0", "net.java.dev.jets3t" % "jets3t" % "0.7.1", "org.apache.derby" % "derby" % "10.4.2.0" % "test", - "org.apache.hadoop" % "hadoop-client" % hadoopVersion excludeAll(excludeJackson, excludeNetty, excludeAsm, excludeCglib), + "org.apache.hadoop" % hadoopClient % hadoopVersion excludeAll(excludeJackson, excludeNetty, excludeAsm, excludeCglib), "org.apache.avro" % "avro" % "1.7.4", "org.apache.avro" % "avro-ipc" % "1.7.4" excludeAll(excludeNetty), "org.apache.zookeeper" % "zookeeper" % "3.4.5" excludeAll(excludeNetty), "com.codahale.metrics" % "metrics-core" % "3.0.0", "com.codahale.metrics" % "metrics-jvm" % "3.0.0", "com.codahale.metrics" % "metrics-json" % "3.0.0", - "com.codahale.metrics" % "metrics-ganglia" % "3.0.0", "com.codahale.metrics" % "metrics-graphite" % "3.0.0", "com.twitter" %% "chill" % "0.3.1", "com.twitter" % "chill-java" % "0.3.1", - "com.typesafe" % "config" % "1.0.2", "com.clearspring.analytics" % "stream" % "2.5.1" ) ) @@ -317,7 +330,10 @@ object SparkBuild extends Build { ) ++ assemblySettings ++ extraAssemblySettings def graphxSettings = sharedSettings ++ Seq( - name := "spark-graphx" + name := "spark-graphx", + libraryDependencies ++= Seq( + "org.apache.commons" % "commons-math3" % "3.2" + ) ) def bagelSettings = sharedSettings ++ Seq( @@ -361,6 +377,11 @@ object SparkBuild extends Build { name := "spark-yarn" ) + def gangliaSettings = sharedSettings ++ Seq( + name := "spark-ganglia-lgpl", + libraryDependencies += "com.codahale.metrics" % "metrics-ganglia" % "3.0.0" + ) + // Conditionally include the YARN dependencies because some tools look at all sub-projects and will complain // if we refer to nonexistent dependencies (e.g. hadoop-yarn-api from a Hadoop version without YARN). def extraYarnSettings = if(isYarnEnabled) yarnEnabledSettings else Seq() @@ -368,7 +389,7 @@ object SparkBuild extends Build { def yarnEnabledSettings = Seq( libraryDependencies ++= Seq( // Exclude rule required for all ? 
- "org.apache.hadoop" % "hadoop-client" % hadoopVersion excludeAll(excludeJackson, excludeNetty, excludeAsm, excludeCglib), + "org.apache.hadoop" % hadoopClient % hadoopVersion excludeAll(excludeJackson, excludeNetty, excludeAsm, excludeCglib), "org.apache.hadoop" % "hadoop-yarn-api" % hadoopVersion excludeAll(excludeJackson, excludeNetty, excludeAsm, excludeCglib), "org.apache.hadoop" % "hadoop-yarn-common" % hadoopVersion excludeAll(excludeJackson, excludeNetty, excludeAsm, excludeCglib), "org.apache.hadoop" % "hadoop-yarn-client" % hadoopVersion excludeAll(excludeJackson, excludeNetty, excludeAsm, excludeCglib) @@ -378,6 +399,7 @@ object SparkBuild extends Build { def assemblyProjSettings = sharedSettings ++ Seq( libraryDependencies += "net.sf.py4j" % "py4j" % "0.8.1", name := "spark-assembly", + scalastyleTask, assembleDeps in Compile <<= (packageProjects.map(packageBin in Compile in _) ++ Seq(packageDependency in Compile)).dependOn, jarName in assembly <<= version map { v => "spark-assembly-" + v + "-hadoop" + hadoopVersion + ".jar" }, jarName in packageDependency <<= version map { v => "spark-assembly-" + v + "-hadoop" + hadoopVersion + "-deps.jar" } diff --git a/project/project/SparkPluginBuild.scala b/project/project/SparkPluginBuild.scala index 6a66bd1d06b23..4853be2617684 100644 --- a/project/project/SparkPluginBuild.scala +++ b/project/project/SparkPluginBuild.scala @@ -20,5 +20,5 @@ import sbt._ object SparkPluginDef extends Build { lazy val root = Project("plugins", file(".")) dependsOn(junitXmlListener) /* This is not published in a Maven repository, so we get it from GitHub directly */ - lazy val junitXmlListener = uri("git://github.com/ijuma/junit_xml_listener.git#fe434773255b451a38e8d889536ebc260f4225ce") + lazy val junitXmlListener = uri("https://github.com/ijuma/junit_xml_listener.git#fe434773255b451a38e8d889536ebc260f4225ce") } diff --git a/repl-bin/src/deb/bin/spark-executor b/python/examples/sort.py similarity index 54% rename from repl-bin/src/deb/bin/spark-executor rename to python/examples/sort.py index 052d76fb8d81c..5de20a6d98f43 100755 --- a/repl-bin/src/deb/bin/spark-executor +++ b/python/examples/sort.py @@ -1,5 +1,3 @@ -#!/usr/bin/env bash - # # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with @@ -17,6 +15,22 @@ # limitations under the License. # -FWDIR="$(cd `dirname $0`; pwd)" -echo "Running spark-executor with framework dir = $FWDIR" -exec $FWDIR/run org.apache.spark.executor.MesosExecutorBackend +import sys + +from pyspark import SparkContext + + +if __name__ == "__main__": + if len(sys.argv) < 3: + print >> sys.stderr, "Usage: sort " + exit(-1) + sc = SparkContext(sys.argv[1], "PythonSort") + lines = sc.textFile(sys.argv[2], 1) + sortedCount = lines.flatMap(lambda x: x.split(' ')) \ + .map(lambda x: (int(x), 1)) \ + .sortByKey(lambda x: x) + # This is just a demo on how to bring all the sorted data back to a single node. + # In reality, we wouldn't want to collect all the data to the driver node. + output = sortedCount.collect() + for (num, unitcount) in output: + print num diff --git a/python/pyspark/conf.py b/python/pyspark/conf.py index d72aed6a30ec1..3870cd8f2b097 100644 --- a/python/pyspark/conf.py +++ b/python/pyspark/conf.py @@ -61,14 +61,12 @@ class SparkConf(object): Most of the time, you would create a SparkConf object with C{SparkConf()}, which will load values from C{spark.*} Java system - properties and any C{spark.conf} on your Spark classpath. 
In this - case, system properties take priority over C{spark.conf}, and any - parameters you set directly on the C{SparkConf} object take priority - over both of those. + properties as well. In this case, any parameters you set directly on + the C{SparkConf} object take priority over system properties. For unit tests, you can also call C{SparkConf(false)} to skip loading external settings and get the same configuration no matter - what is on the classpath. + what the system properties are. All setter methods in this class support chaining. For example, you can write C{conf.setMaster("local").setAppName("My app")}. @@ -82,7 +80,7 @@ def __init__(self, loadDefaults=True, _jvm=None): Create a new Spark configuration. @param loadDefaults: whether to load values from Java system - properties and classpath (True by default) + properties (True by default) @param _jvm: internal parameter used to pass a handle to the Java VM; does not need to be set by users """ diff --git a/python/pyspark/context.py b/python/pyspark/context.py index f955aad7a4f12..f318b5d9a73d7 100644 --- a/python/pyspark/context.py +++ b/python/pyspark/context.py @@ -27,7 +27,7 @@ from pyspark.conf import SparkConf from pyspark.files import SparkFiles from pyspark.java_gateway import launch_gateway -from pyspark.serializers import PickleSerializer, BatchedSerializer, MUTF8Deserializer +from pyspark.serializers import PickleSerializer, BatchedSerializer, UTF8Deserializer from pyspark.storagelevel import StorageLevel from pyspark.rdd import RDD @@ -234,7 +234,7 @@ def textFile(self, name, minSplits=None): """ minSplits = minSplits or min(self.defaultParallelism, 2) return RDD(self._jsc.textFile(name, minSplits), self, - MUTF8Deserializer()) + UTF8Deserializer()) def _checkpointFile(self, name, input_deserializer): jrdd = self._jsc.checkpointFile(name) diff --git a/python/pyspark/mllib/__init__.py b/python/pyspark/mllib/__init__.py index b1a5df109b46e..b420d7a7f23ba 100644 --- a/python/pyspark/mllib/__init__.py +++ b/python/pyspark/mllib/__init__.py @@ -18,3 +18,13 @@ """ Python bindings for MLlib. """ + +# MLlib currently needs Python 2.7+ and NumPy 1.7+, so complain if lower + +import sys +if sys.version_info[0:2] < (2, 7): + raise Exception("MLlib requires Python 2.7+") + +import numpy +if numpy.version.version < '1.7': + raise Exception("MLlib requires NumPy 1.7+") diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py index 6fb4a7b3be25d..678b005414c0d 100644 --- a/python/pyspark/rdd.py +++ b/python/pyspark/rdd.py @@ -27,6 +27,8 @@ from subprocess import Popen, PIPE from tempfile import NamedTemporaryFile from threading import Thread +import warnings +from heapq import heappush, heappop, heappushpop from pyspark.serializers import NoOpSerializer, CartesianDeserializer, \ BatchedSerializer, CloudPickleSerializer, pack_long @@ -162,7 +164,7 @@ def getCheckpointFile(self): def map(self, f, preservesPartitioning=False): """ - Return a new RDD containing the distinct elements in this RDD. + Return a new RDD by applying a function to each element of this RDD. 
""" def func(split, iterator): return imap(f, iterator) return PipelinedRDD(self, func, preservesPartitioning) @@ -179,7 +181,7 @@ def flatMap(self, f, preservesPartitioning=False): [(2, 2), (2, 2), (3, 3), (3, 3), (4, 4), (4, 4)] """ def func(s, iterator): return chain.from_iterable(imap(f, iterator)) - return self.mapPartitionsWithSplit(func, preservesPartitioning) + return self.mapPartitionsWithIndex(func, preservesPartitioning) def mapPartitions(self, f, preservesPartitioning=False): """ @@ -191,10 +193,24 @@ def mapPartitions(self, f, preservesPartitioning=False): [3, 7] """ def func(s, iterator): return f(iterator) - return self.mapPartitionsWithSplit(func) + return self.mapPartitionsWithIndex(func) + + def mapPartitionsWithIndex(self, f, preservesPartitioning=False): + """ + Return a new RDD by applying a function to each partition of this RDD, + while tracking the index of the original partition. + + >>> rdd = sc.parallelize([1, 2, 3, 4], 4) + >>> def f(splitIndex, iterator): yield splitIndex + >>> rdd.mapPartitionsWithIndex(f).sum() + 6 + """ + return PipelinedRDD(self, f, preservesPartitioning) def mapPartitionsWithSplit(self, f, preservesPartitioning=False): """ + Deprecated: use mapPartitionsWithIndex instead. + Return a new RDD by applying a function to each partition of this RDD, while tracking the index of the original partition. @@ -203,7 +219,9 @@ def mapPartitionsWithSplit(self, f, preservesPartitioning=False): >>> rdd.mapPartitionsWithSplit(f).sum() 6 """ - return PipelinedRDD(self, f, preservesPartitioning) + warnings.warn("mapPartitionsWithSplit is deprecated; " + "use mapPartitionsWithIndex instead", DeprecationWarning, stacklevel=2) + return self.mapPartitionsWithIndex(f, preservesPartitioning) def filter(self, f): """ @@ -235,7 +253,7 @@ def sample(self, withReplacement, fraction, seed): >>> sc.parallelize(range(0, 100)).sample(False, 0.1, 2).collect() #doctest: +SKIP [2, 3, 20, 21, 24, 41, 42, 66, 67, 89, 90, 98] """ - return self.mapPartitionsWithSplit(RDDSampler(withReplacement, fraction, seed).func, True) + return self.mapPartitionsWithIndex(RDDSampler(withReplacement, fraction, seed).func, True) # this is ported from scala/spark/RDD.scala def takeSample(self, withReplacement, num, seed): @@ -599,6 +617,30 @@ def mergeMaps(m1, m2): m1[k] += v return m1 return self.mapPartitions(countPartition).reduce(mergeMaps) + + def top(self, num): + """ + Get the top N elements from a RDD. + + Note: It returns the list sorted in ascending order. + >>> sc.parallelize([10, 4, 2, 12, 3]).top(1) + [12] + >>> sc.parallelize([2, 3, 4, 5, 6]).cache().top(2) + [5, 6] + """ + def topIterator(iterator): + q = [] + for k in iterator: + if len(q) < num: + heappush(q, k) + else: + heappushpop(q, k) + yield q + + def merge(a, b): + return next(topIterator(a + b)) + + return sorted(self.mapPartitions(topIterator).reduce(merge)) def take(self, num): """ diff --git a/python/pyspark/serializers.py b/python/pyspark/serializers.py index 2a500ab919bea..8c6ad79059c23 100644 --- a/python/pyspark/serializers.py +++ b/python/pyspark/serializers.py @@ -261,13 +261,13 @@ class MarshalSerializer(FramedSerializer): loads = marshal.loads -class MUTF8Deserializer(Serializer): +class UTF8Deserializer(Serializer): """ - Deserializes streams written by Java's DataOutputStream.writeUTF(). + Deserializes streams written by getBytes. 
""" def loads(self, stream): - length = struct.unpack('>H', stream.read(2))[0] + length = read_int(stream) return stream.read(length).decode('utf8') def load_stream(self, stream): diff --git a/python/pyspark/shell.py b/python/pyspark/shell.py index 1602227a273e7..920334205c13e 100644 --- a/python/pyspark/shell.py +++ b/python/pyspark/shell.py @@ -35,7 +35,7 @@ ____ __ / __/__ ___ _____/ /__ _\ \/ _ \/ _ `/ __/ '_/ - /__ / .__/\_,_/_/ /_/\_\ version 0.9.0-SNAPSHOT + /__ / .__/\_,_/_/ /_/\_\ version 0.9.0 /_/ """ print "Using Python version %s (%s, %s)" % ( diff --git a/python/pyspark/tests.py b/python/pyspark/tests.py index 7acb6eaf10931..527104587fd31 100644 --- a/python/pyspark/tests.py +++ b/python/pyspark/tests.py @@ -152,6 +152,33 @@ def test_save_as_textfile_with_unicode(self): raw_contents = ''.join(input(glob(tempFile.name + "/part-0000*"))) self.assertEqual(x, unicode(raw_contents.strip(), "utf-8")) + def test_transforming_cartesian_result(self): + # Regression test for SPARK-1034 + rdd1 = self.sc.parallelize([1, 2]) + rdd2 = self.sc.parallelize([3, 4]) + cart = rdd1.cartesian(rdd2) + result = cart.map(lambda (x, y): x + y).collect() + + def test_cartesian_on_textfile(self): + # Regression test for + path = os.path.join(SPARK_HOME, "python/test_support/hello.txt") + a = self.sc.textFile(path) + result = a.cartesian(a).collect() + (x, y) = result[0] + self.assertEqual("Hello World!", x.strip()) + self.assertEqual("Hello World!", y.strip()) + + def test_deleting_input_files(self): + # Regression test for SPARK-1025 + tempFile = NamedTemporaryFile(delete=False) + tempFile.write("Hello World!") + tempFile.close() + data = self.sc.textFile(tempFile.name) + filtered_data = data.filter(lambda x: True) + self.assertEqual(1, filtered_data.count()) + os.unlink(tempFile.name) + self.assertRaises(Exception, lambda: filtered_data.count()) + class TestIO(PySparkTestCase): diff --git a/python/pyspark/worker.py b/python/pyspark/worker.py index d77981f61fa36..4e47d02965c4d 100644 --- a/python/pyspark/worker.py +++ b/python/pyspark/worker.py @@ -30,11 +30,11 @@ from pyspark.cloudpickle import CloudPickler from pyspark.files import SparkFiles from pyspark.serializers import write_with_length, write_int, read_long, \ - write_long, read_int, SpecialLengths, MUTF8Deserializer, PickleSerializer + write_long, read_int, SpecialLengths, UTF8Deserializer, PickleSerializer pickleSer = PickleSerializer() -mutf8_deserializer = MUTF8Deserializer() +utf8_deserializer = UTF8Deserializer() def report_times(outfile, boot, init, finish): @@ -45,34 +45,34 @@ def report_times(outfile, boot, init, finish): def main(infile, outfile): - boot_time = time.time() - split_index = read_int(infile) - if split_index == -1: # for unit tests - return + try: + boot_time = time.time() + split_index = read_int(infile) + if split_index == -1: # for unit tests + return - # fetch name of workdir - spark_files_dir = mutf8_deserializer.loads(infile) - SparkFiles._root_directory = spark_files_dir - SparkFiles._is_running_on_worker = True + # fetch name of workdir + spark_files_dir = utf8_deserializer.loads(infile) + SparkFiles._root_directory = spark_files_dir + SparkFiles._is_running_on_worker = True - # fetch names and values of broadcast variables - num_broadcast_variables = read_int(infile) - for _ in range(num_broadcast_variables): - bid = read_long(infile) - value = pickleSer._read_with_length(infile) - _broadcastRegistry[bid] = Broadcast(bid, value) + # fetch names and values of broadcast variables + num_broadcast_variables = 
read_int(infile) + for _ in range(num_broadcast_variables): + bid = read_long(infile) + value = pickleSer._read_with_length(infile) + _broadcastRegistry[bid] = Broadcast(bid, value) - # fetch names of includes (*.zip and *.egg files) and construct PYTHONPATH - sys.path.append(spark_files_dir) # *.py files that were added will be copied here - num_python_includes = read_int(infile) - for _ in range(num_python_includes): - filename = mutf8_deserializer.loads(infile) - sys.path.append(os.path.join(spark_files_dir, filename)) + # fetch names of includes (*.zip and *.egg files) and construct PYTHONPATH + sys.path.append(spark_files_dir) # *.py files that were added will be copied here + num_python_includes = read_int(infile) + for _ in range(num_python_includes): + filename = utf8_deserializer.loads(infile) + sys.path.append(os.path.join(spark_files_dir, filename)) - command = pickleSer._read_with_length(infile) - (func, deserializer, serializer) = command - init_time = time.time() - try: + command = pickleSer._read_with_length(infile) + (func, deserializer, serializer) = command + init_time = time.time() iterator = deserializer.load_stream(infile) serializer.dump_stream(func(split_index, iterator), outfile) except Exception as e: diff --git a/python/run-tests b/python/run-tests index 2005f610b43b4..a986ac9380be4 100755 --- a/python/run-tests +++ b/python/run-tests @@ -40,11 +40,11 @@ run_test "-m doctest pyspark/broadcast.py" run_test "-m doctest pyspark/accumulators.py" run_test "-m doctest pyspark/serializers.py" run_test "pyspark/tests.py" -#run_test "pyspark/mllib/_common.py" -#run_test "pyspark/mllib/classification.py" -#run_test "pyspark/mllib/clustering.py" -#run_test "pyspark/mllib/recommendation.py" -#run_test "pyspark/mllib/regression.py" +run_test "pyspark/mllib/_common.py" +run_test "pyspark/mllib/classification.py" +run_test "pyspark/mllib/clustering.py" +run_test "pyspark/mllib/recommendation.py" +run_test "pyspark/mllib/regression.py" if [[ $FAILED != 0 ]]; then echo -en "\033[31m" # Red diff --git a/repl-bin/pom.xml b/repl-bin/pom.xml deleted file mode 100644 index 869dbdb9b095a..0000000000000 --- a/repl-bin/pom.xml +++ /dev/null @@ -1,184 +0,0 @@ - - - - - 4.0.0 - - org.apache.spark - spark-parent - 0.9.0-incubating-SNAPSHOT - ../pom.xml - - - org.apache.spark - spark-repl-bin_2.10 - pom - Spark Project REPL binary packaging - http://spark.incubator.apache.org/ - - - spark - /usr/share/spark - root - - - - - org.apache.spark - spark-core_${scala.binary.version} - ${project.version} - - - org.apache.spark - spark-bagel_${scala.binary.version} - ${project.version} - runtime - - - org.apache.spark - spark-repl_${scala.binary.version} - ${project.version} - runtime - - - - - - - org.apache.maven.plugins - maven-shade-plugin - - false - ${project.build.directory}/${project.artifactId}-${project.version}-shaded.jar - - - *:* - - - - - *:* - - META-INF/*.SF - META-INF/*.DSA - META-INF/*.RSA - - - - - - - package - - shade - - - - - - reference.conf - - - spark.repl.Main - - - - - - - - - - - - deb - - - - org.codehaus.mojo - buildnumber-maven-plugin - 1.1 - - - validate - - create - - - 8 - - - - - - org.vafer - jdeb - 0.11 - - - package - - jdeb - - - ${project.build.directory}/${deb.pkg.name}_${project.version}-${buildNumber}_all.deb - false - gzip - - - ${project.build.directory}/${project.artifactId}-${project.version}-shaded.jar - file - - perm - ${deb.user} - ${deb.user} - ${deb.install.path} - - - - ${basedir}/src/deb/bin - directory - - perm - ${deb.user} - ${deb.user} - 
${deb.install.path} - 744 - - - - ${basedir}/../conf - directory - - perm - ${deb.user} - ${deb.user} - ${deb.install.path}/conf - 744 - - - - - - - - - - - - diff --git a/repl-bin/src/deb/bin/run b/repl-bin/src/deb/bin/run deleted file mode 100755 index 3a6f22f41fca5..0000000000000 --- a/repl-bin/src/deb/bin/run +++ /dev/null @@ -1,57 +0,0 @@ -#!/usr/bin/env bash - -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -SCALA_VERSION=2.10 - -# Figure out where the Scala framework is installed -FWDIR="$(cd `dirname $0`; pwd)" - -# Export this as SPARK_HOME -export SPARK_HOME="$FWDIR" - -# Load environment variables from conf/spark-env.sh, if it exists -if [ -e $FWDIR/conf/spark-env.sh ] ; then - . $FWDIR/conf/spark-env.sh -fi - -# Figure out how much memory to use per executor and set it as an environment -# variable so that our process sees it and can report it to Mesos -if [ -z "$SPARK_MEM" ] ; then - SPARK_MEM="512m" -fi -export SPARK_MEM - -# Set JAVA_OPTS to be able to load native libraries and to set heap size -JAVA_OPTS="$SPARK_JAVA_OPTS" -JAVA_OPTS+=" -Djava.library.path=$SPARK_LIBRARY_PATH" -JAVA_OPTS+=" -Xms$SPARK_MEM -Xmx$SPARK_MEM" -# Load extra JAVA_OPTS from conf/java-opts, if it exists -if [ -e $FWDIR/conf/java-opts ] ; then - JAVA_OPTS+=" `cat $FWDIR/conf/java-opts`" -fi -export JAVA_OPTS - -# Build up classpath -CLASSPATH=":$FWDIR/conf" -for jar in `find $FWDIR -name '*jar'`; do - CLASSPATH+=":$jar" -done -export CLASSPATH - -exec java -Dscala.usejavacp=true -Djline.shutdownhook=true -cp "$CLASSPATH" $JAVA_OPTS $EXTRA_ARGS "$@" diff --git a/repl/pom.xml b/repl/pom.xml index 2dfe7ac900b83..346c672165d7d 100644 --- a/repl/pom.xml +++ b/repl/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent - 0.9.0-incubating-SNAPSHOT + 0.9.1-incubating-SNAPSHOT ../pom.xml diff --git a/repl/src/main/scala/org/apache/spark/repl/SparkILoop.scala b/repl/src/main/scala/org/apache/spark/repl/SparkILoop.scala index 87d94d51be199..f262faa597b8f 100644 --- a/repl/src/main/scala/org/apache/spark/repl/SparkILoop.scala +++ b/repl/src/main/scala/org/apache/spark/repl/SparkILoop.scala @@ -180,8 +180,13 @@ class SparkILoop(in0: Option[BufferedReader], protected val out: JPrintWriter, /** Create a new interpreter. 
*/ def createInterpreter() { - if (addedClasspath != "") - settings.classpath append addedClasspath + require(settings != null) + + if (addedClasspath != "") settings.classpath.append(addedClasspath) + // work around for Scala bug + val totalClassPath = SparkILoop.getAddedJars.foldLeft( + settings.classpath.value)((l, r) => ClassPath.join(l, r)) + this.settings.classpath.value = totalClassPath intp = new SparkILoopInterpreter } diff --git a/repl/src/main/scala/org/apache/spark/repl/SparkILoopInit.scala b/repl/src/main/scala/org/apache/spark/repl/SparkILoopInit.scala index 21b1ba305d110..ab5e283d65f07 100644 --- a/repl/src/main/scala/org/apache/spark/repl/SparkILoopInit.scala +++ b/repl/src/main/scala/org/apache/spark/repl/SparkILoopInit.scala @@ -24,7 +24,7 @@ trait SparkILoopInit { ____ __ / __/__ ___ _____/ /__ _\ \/ _ \/ _ `/ __/ '_/ - /___/ .__/\_,_/_/ /_/\_\ version 0.9.0-SNAPSHOT + /___/ .__/\_,_/_/ /_/\_\ version 0.9.0 /_/ """) import Properties._ diff --git a/repl/src/test/scala/org/apache/spark/repl/ReplSuite.scala b/repl/src/test/scala/org/apache/spark/repl/ReplSuite.scala index 8aad27366524a..8203b8f6122e1 100644 --- a/repl/src/test/scala/org/apache/spark/repl/ReplSuite.scala +++ b/repl/src/test/scala/org/apache/spark/repl/ReplSuite.scala @@ -1,3 +1,20 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + package org.apache.spark.repl import java.io._ diff --git a/sbin/spark-daemon.sh b/sbin/spark-daemon.sh index 2be2b3d7c0933..5a21ebe8a4ce9 100755 --- a/sbin/spark-daemon.sh +++ b/sbin/spark-daemon.sh @@ -147,7 +147,7 @@ case $startStop in spark_rotate_log "$log" echo starting $command, logging to $log cd "$SPARK_PREFIX" - nohup nice -n $SPARK_NICENESS "$SPARK_PREFIX"/bin/spark-class $command "$@" >> "$log" 2>&1 < /dev/null & + nohup nice -n $SPARK_NICENESS ./bin/spark-class $command "$@" >> "$log" 2>&1 < /dev/null & newpid=$! 
echo $newpid > $pid sleep 2 diff --git a/streaming/pom.xml b/streaming/pom.xml index 459756912dbe5..7e10ef6f471be 100644 --- a/streaming/pom.xml +++ b/streaming/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent - 0.9.0-incubating-SNAPSHOT + 0.9.1-incubating-SNAPSHOT ../pom.xml diff --git a/streaming/src/main/scala/org/apache/spark/streaming/Checkpoint.scala b/streaming/src/main/scala/org/apache/spark/streaming/Checkpoint.scala index 5046a1d53fa41..4d778dc4d43b4 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/Checkpoint.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/Checkpoint.scala @@ -42,11 +42,13 @@ class Checkpoint(@transient ssc: StreamingContext, val checkpointTime: Time) val checkpointDuration = ssc.checkpointDuration val pendingTimes = ssc.scheduler.getPendingTimes().toArray val delaySeconds = MetadataCleaner.getDelaySeconds(ssc.conf) - val sparkConf = ssc.conf + val sparkConfPairs = ssc.conf.getAll - // These should be unset when a checkpoint is deserialized, - // otherwise the SparkContext won't initialize correctly. - sparkConf.remove("spark.driver.host").remove("spark.driver.port") + def sparkConf = { + new SparkConf(false).setAll(sparkConfPairs) + .remove("spark.driver.host") + .remove("spark.driver.port") + } def validate() { assert(master != null, "Checkpoint.master is null") diff --git a/streaming/src/main/scala/org/apache/spark/streaming/ContextWaiter.scala b/streaming/src/main/scala/org/apache/spark/streaming/ContextWaiter.scala index 1f5dacb543db8..86753360a07e4 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/ContextWaiter.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/ContextWaiter.scala @@ -1,3 +1,20 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + package org.apache.spark.streaming private[streaming] class ContextWaiter { diff --git a/streaming/src/main/scala/org/apache/spark/streaming/DStreamGraph.scala b/streaming/src/main/scala/org/apache/spark/streaming/DStreamGraph.scala index 8faa79f8c7e9d..0683113bd0b51 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/DStreamGraph.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/DStreamGraph.scala @@ -163,8 +163,10 @@ final private[streaming] class DStreamGraph extends Serializable with Logging { logDebug("DStreamGraph.writeObject used") this.synchronized { checkpointInProgress = true + logDebug("Enabled checkpoint mode") oos.defaultWriteObject() checkpointInProgress = false + logDebug("Disabled checkpoint mode") } } diff --git a/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala b/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala index 26257e652e537..5847b95e3f5d1 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala @@ -42,9 +42,15 @@ import org.apache.spark.streaming.scheduler._ import org.apache.hadoop.conf.Configuration /** - * A StreamingContext is the main entry point for Spark Streaming functionality. Besides the basic - * information (such as, cluster URL and job name) to internally create a SparkContext, it provides - * methods used to create DStream from various input sources. + * Main entry point for Spark Streaming functionality. It provides methods used to create + * [[org.apache.spark.streaming.dstream.DStream]]s from various input sources. It can be either + * created by providing a Spark master URL and an appName, or from a org.apache.spark.SparkConf + * configuration (see core Spark documentation), or from an existing org.apache.spark.SparkContext. + * The associated SparkContext can be accessed using `context.sparkContext`. After + * creating and transforming DStreams, the streaming computation can be started and stopped + * using `context.start()` and `context.stop()`, respectively. + * `context.awaitTransformation()` allows the current thread to wait for the termination + * of the context by `stop()` or by an exception. */ class StreamingContext private[streaming] ( sc_ : SparkContext, @@ -63,7 +69,7 @@ class StreamingContext private[streaming] ( /** * Create a StreamingContext by providing the configuration necessary for a new SparkContext. - * @param conf a [[org.apache.spark.SparkConf]] object specifying Spark parameters + * @param conf a org.apache.spark.SparkConf object specifying Spark parameters * @param batchDuration the time interval at which streaming data will be divided into batches */ def this(conf: SparkConf, batchDuration: Duration) = { @@ -88,7 +94,7 @@ class StreamingContext private[streaming] ( } /** - * Re-create a StreamingContext from a checkpoint file. + * Recreate a StreamingContext from a checkpoint file. 
* @param path Path to the directory that was specified as the checkpoint directory * @param hadoopConf Optional, configuration object if necessary for reading from * HDFS compatible filesystems @@ -151,6 +157,7 @@ class StreamingContext private[streaming] ( private[streaming] val scheduler = new JobScheduler(this) private[streaming] val waiter = new ContextWaiter + /** * Return the associated Spark context */ diff --git a/streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaDStream.scala b/streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaDStream.scala index c92854ccd9a28..e23b725052864 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaDStream.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaDStream.scala @@ -27,22 +27,12 @@ import scala.reflect.ClassTag import org.apache.spark.streaming.dstream.DStream /** - * A Discretized Stream (DStream), the basic abstraction in Spark Streaming, is a continuous - * sequence of RDDs (of the same type) representing a continuous stream of data (see [[org.apache.spark.rdd.RDD]] - * for more details on RDDs). DStreams can either be created from live data (such as, data from - * HDFS, Kafka or Flume) or it can be generated by transformation existing DStreams using operations - * such as `map`, `window` and `reduceByKeyAndWindow`. While a Spark Streaming program is running, each - * DStream periodically generates a RDD, either from live data or by transforming the RDD generated - * by a parent DStream. - * - * This class contains the basic operations available on all DStreams, such as `map`, `filter` and - * `window`. In addition, [[org.apache.spark.streaming.api.java.JavaPairDStream]] contains operations available - * only on DStreams of key-value pairs, such as `groupByKeyAndWindow` and `join`. - * - * DStreams internally is characterized by a few basic properties: - * - A list of other DStreams that the DStream depends on - * - A time interval at which the DStream generates an RDD - * - A function that is used to generate an RDD after each time interval + * A Java-friendly interface to [[org.apache.spark.streaming.dstream.DStream]], the basic + * abstraction in Spark Streaming that represents a continuous stream of data. + * DStreams can either be created from live data (such as, data from TCP sockets, Kafka, Flume, + * etc.) or it can be generated by transforming existing DStreams using operations such as `map`, + * `window`. For operations applicable to key-value pair DStreams, see + * [[org.apache.spark.streaming.api.java.JavaPairDStream]]. */ class JavaDStream[T](val dstream: DStream[T])(implicit val classTag: ClassTag[T]) extends JavaDStreamLike[T, JavaDStream[T], JavaRDD[T]] { diff --git a/streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaDStreamLike.scala b/streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaDStreamLike.scala index a493a8279f942..64fe204cdf7a5 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaDStreamLike.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaDStreamLike.scala @@ -138,7 +138,7 @@ trait JavaDStreamLike[T, This <: JavaDStreamLike[T, This, R], R <: JavaRDDLike[T /** Return a new DStream by applying a function to all elements of this DStream. 
*/ def map[K2, V2](f: PairFunction[T, K2, V2]): JavaPairDStream[K2, V2] = { - def cm = implicitly[ClassTag[AnyRef]].asInstanceOf[ClassTag[Tuple2[K2, V2]]] + def cm = implicitly[ClassTag[Tuple2[_, _]]].asInstanceOf[ClassTag[Tuple2[K2, V2]]] new JavaPairDStream(dstream.map(f)(cm))(f.keyType(), f.valueType()) } @@ -159,7 +159,7 @@ trait JavaDStreamLike[T, This <: JavaDStreamLike[T, This, R], R <: JavaRDDLike[T def flatMap[K2, V2](f: PairFlatMapFunction[T, K2, V2]): JavaPairDStream[K2, V2] = { import scala.collection.JavaConverters._ def fn = (x: T) => f.apply(x).asScala - def cm = implicitly[ClassTag[AnyRef]].asInstanceOf[ClassTag[Tuple2[K2, V2]]] + def cm = implicitly[ClassTag[Tuple2[_, _]]].asInstanceOf[ClassTag[Tuple2[K2, V2]]] new JavaPairDStream(dstream.flatMap(fn)(cm))(f.keyType(), f.valueType()) } diff --git a/streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaPairDStream.scala b/streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaPairDStream.scala index 6bb985ca540ff..62cfa0a229db1 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaPairDStream.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaPairDStream.scala @@ -37,6 +37,10 @@ import org.apache.spark.rdd.RDD import org.apache.spark.rdd.PairRDDFunctions import org.apache.spark.streaming.dstream.DStream +/** + * A Java-friendly interface to a DStream of key-value pairs, which provides extra methods + * like `reduceByKey` and `join`. + */ class JavaPairDStream[K, V](val dstream: DStream[(K, V)])( implicit val kManifest: ClassTag[K], implicit val vManifest: ClassTag[V]) @@ -741,7 +745,7 @@ class JavaPairDStream[K, V](val dstream: DStream[(K, V)])( } override val classTag: ClassTag[(K, V)] = - implicitly[ClassTag[AnyRef]].asInstanceOf[ClassTag[Tuple2[K, V]]] + implicitly[ClassTag[Tuple2[_, _]]].asInstanceOf[ClassTag[Tuple2[K, V]]] } object JavaPairDStream { diff --git a/streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaStreamingContext.scala b/streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaStreamingContext.scala index 4edf8fa13a205..921b56143af25 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaStreamingContext.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaStreamingContext.scala @@ -22,7 +22,6 @@ import scala.collection.JavaConversions._ import scala.reflect.ClassTag import java.io.InputStream -import java.lang.{Integer => JInt} import java.util.{List => JList, Map => JMap} import akka.actor.{Props, SupervisorStrategy} @@ -39,19 +38,20 @@ import org.apache.hadoop.conf.Configuration import org.apache.spark.streaming.dstream.DStream /** - * A StreamingContext is the main entry point for Spark Streaming functionality. Besides the basic - * information (such as, cluster URL and job name) to internally create a SparkContext, it provides - * methods used to create DStream from various input sources. + * A Java-friendly version of [[org.apache.spark.streaming.StreamingContext]] which is the main + * entry point for Spark Streaming functionality. It provides methods to create + * [[org.apache.spark.streaming.api.java.JavaDStream]] and + * [[org.apache.spark.streaming.api.java.JavaPairDStream.]] from input sources. The internal + * org.apache.spark.api.java.JavaSparkContext (see core Spark documentation) can be accessed + * using `context.sparkContext`. 
After creating and transforming DStreams, the streaming + * computation can be started and stopped using `context.start()` and `context.stop()`, + * respectively. `context.awaitTransformation()` allows the current thread to wait for the + * termination of a context by `stop()` or by an exception. */ class JavaStreamingContext(val ssc: StreamingContext) { - // TODOs: - // - Test to/from Hadoop functions - // - Support creating and registering InputStreams - - /** - * Creates a StreamingContext. + * Create a StreamingContext. * @param master Name of the Spark Master * @param appName Name to be used when registering with the scheduler * @param batchDuration The time interval at which streaming data will be divided into batches @@ -60,7 +60,7 @@ class JavaStreamingContext(val ssc: StreamingContext) { this(new StreamingContext(master, appName, batchDuration, null, Nil, Map())) /** - * Creates a StreamingContext. + * Create a StreamingContext. * @param master Name of the Spark Master * @param appName Name to be used when registering with the scheduler * @param batchDuration The time interval at which streaming data will be divided into batches @@ -77,7 +77,7 @@ class JavaStreamingContext(val ssc: StreamingContext) { this(new StreamingContext(master, appName, batchDuration, sparkHome, Seq(jarFile), Map())) /** - * Creates a StreamingContext. + * Create a StreamingContext. * @param master Name of the Spark Master * @param appName Name to be used when registering with the scheduler * @param batchDuration The time interval at which streaming data will be divided into batches @@ -94,7 +94,7 @@ class JavaStreamingContext(val ssc: StreamingContext) { this(new StreamingContext(master, appName, batchDuration, sparkHome, jars, Map())) /** - * Creates a StreamingContext. + * Create a StreamingContext. * @param master Name of the Spark Master * @param appName Name to be used when registering with the scheduler * @param batchDuration The time interval at which streaming data will be divided into batches @@ -113,7 +113,7 @@ class JavaStreamingContext(val ssc: StreamingContext) { this(new StreamingContext(master, appName, batchDuration, sparkHome, jars, environment)) /** - * Creates a StreamingContext using an existing SparkContext. + * Create a JavaStreamingContext using an existing JavaSparkContext. * @param sparkContext The underlying JavaSparkContext to use * @param batchDuration The time interval at which streaming data will be divided into batches */ @@ -121,7 +121,7 @@ class JavaStreamingContext(val ssc: StreamingContext) { this(new StreamingContext(sparkContext.sc, batchDuration)) /** - * Creates a StreamingContext using an existing SparkContext. + * Create a JavaStreamingContext using a SparkConf configuration. * @param conf A Spark application configuration * @param batchDuration The time interval at which streaming data will be divided into batches */ @@ -129,23 +129,26 @@ class JavaStreamingContext(val ssc: StreamingContext) { this(new StreamingContext(conf, batchDuration)) /** - * Re-creates a StreamingContext from a checkpoint file. + * Recreate a JavaStreamingContext from a checkpoint file. * @param path Path to the directory that was specified as the checkpoint directory */ def this(path: String) = this(new StreamingContext(path, new Configuration)) /** - * Re-creates a StreamingContext from a checkpoint file. + * Re-creates a JavaStreamingContext from a checkpoint file. 
* @param path Path to the directory that was specified as the checkpoint directory * */ def this(path: String, hadoopConf: Configuration) = this(new StreamingContext(path, hadoopConf)) + @deprecated("use sparkContext", "0.9.0") + val sc: JavaSparkContext = sparkContext + /** The underlying SparkContext */ - val sc: JavaSparkContext = new JavaSparkContext(ssc.sc) + val sparkContext = new JavaSparkContext(ssc.sc) /** - * Create a input stream from network source hostname:port. Data is received using + * Create an input stream from network source hostname:port. Data is received using * a TCP socket and the receive bytes is interpreted as UTF8 encoded \n delimited * lines. * @param hostname Hostname to connect to for receiving data @@ -158,7 +161,7 @@ class JavaStreamingContext(val ssc: StreamingContext) { } /** - * Create a input stream from network source hostname:port. Data is received using + * Create an input stream from network source hostname:port. Data is received using * a TCP socket and the receive bytes is interpreted as UTF8 encoded \n delimited * lines. Storage level of the data will be the default StorageLevel.MEMORY_AND_DISK_SER_2. * @param hostname Hostname to connect to for receiving data @@ -169,7 +172,7 @@ class JavaStreamingContext(val ssc: StreamingContext) { } /** - * Create a input stream from network source hostname:port. Data is received using + * Create an input stream from network source hostname:port. Data is received using * a TCP socket and the receive bytes it interepreted as object using the given * converter. * @param hostname Hostname to connect to for receiving data @@ -191,7 +194,7 @@ class JavaStreamingContext(val ssc: StreamingContext) { } /** - * Create a input stream that monitors a Hadoop-compatible filesystem + * Create an input stream that monitors a Hadoop-compatible filesystem * for new files and reads them as text files (using key as LongWritable, value * as Text and input format as TextInputFormat). Files must be written to the * monitored directory by "moving" them from another location within the same @@ -203,7 +206,7 @@ class JavaStreamingContext(val ssc: StreamingContext) { } /** - * Create a input stream from network source hostname:port, where data is received + * Create an input stream from network source hostname:port, where data is received * as serialized blocks (serialized using the Spark's serializer) that can be directly * pushed into the block manager without deserializing them. This is the most efficient * way to receive data. @@ -222,7 +225,7 @@ class JavaStreamingContext(val ssc: StreamingContext) { } /** - * Create a input stream from network source hostname:port, where data is received + * Create an input stream from network source hostname:port, where data is received * as serialized blocks (serialized using the Spark's serializer) that can be directly * pushed into the block manager without deserializing them. This is the most efficient * way to receive data. @@ -237,7 +240,7 @@ class JavaStreamingContext(val ssc: StreamingContext) { } /** - * Create a input stream that monitors a Hadoop-compatible filesystem + * Create an input stream that monitors a Hadoop-compatible filesystem * for new files and reads them using the given key-value types and input format. * Files must be written to the monitored directory by "moving" them from another * location within the same file system. File names starting with . are ignored. 
@@ -320,7 +323,7 @@ class JavaStreamingContext(val ssc: StreamingContext) { } /** - * Creates a input stream from an queue of RDDs. In each batch, + * Creates an input stream from an queue of RDDs. In each batch, * it will process either one or all of the RDDs returned by the queue. * * NOTE: changes to the queue after the stream is created will not be recognized. @@ -336,7 +339,7 @@ class JavaStreamingContext(val ssc: StreamingContext) { } /** - * Creates a input stream from an queue of RDDs. In each batch, + * Creates an input stream from an queue of RDDs. In each batch, * it will process either one or all of the RDDs returned by the queue. * * NOTE: changes to the queue after the stream is created will not be recognized. @@ -353,7 +356,7 @@ class JavaStreamingContext(val ssc: StreamingContext) { } /** - * Creates a input stream from an queue of RDDs. In each batch, + * Creates an input stream from an queue of RDDs. In each batch, * it will process either one or all of the RDDs returned by the queue. * * NOTE: changes to the queue after the stream is created will not be recognized. diff --git a/streaming/src/main/scala/org/apache/spark/streaming/dstream/DStream.scala b/streaming/src/main/scala/org/apache/spark/streaming/dstream/DStream.scala index 71a4c5c93e76a..6bff56a9d332a 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/dstream/DStream.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/dstream/DStream.scala @@ -37,8 +37,9 @@ import org.apache.spark.streaming.Duration * A Discretized Stream (DStream), the basic abstraction in Spark Streaming, is a continuous * sequence of RDDs (of the same type) representing a continuous stream of data (see * org.apache.spark.rdd.RDD in the Spark core documentation for more details on RDDs). - * DStreams can either be created from live data (such as, data from Kafka, Flume, sockets, HDFS) - * or it can be generated by transforming existing DStreams using operations such as `map`, + * DStreams can either be created from live data (such as, data from TCP sockets, Kafka, Flume, + * etc.) using a [[org.apache.spark.streaming.StreamingContext]] or it can be generated by + * transforming existing DStreams using operations such as `map`, * `window` and `reduceByKeyAndWindow`. While a Spark Streaming program is running, each DStream * periodically generates a RDD, either from live data or by transforming the RDD generated by a * parent DStream. @@ -540,7 +541,6 @@ abstract class DStream[T: ClassTag] ( * on each RDD of 'this' DStream. 
*/ def transform[U: ClassTag](transformFunc: (RDD[T], Time) => RDD[U]): DStream[U] = { - //new TransformedDStream(this, context.sparkContext.clean(transformFunc)) val cleanedF = context.sparkContext.clean(transformFunc) val realTransformFunc = (rdds: Seq[RDD[_]], time: Time) => { assert(rdds.length == 1) diff --git a/streaming/src/main/scala/org/apache/spark/streaming/dstream/DStreamCheckpointData.scala b/streaming/src/main/scala/org/apache/spark/streaming/dstream/DStreamCheckpointData.scala index 38bad5ac8042a..906a16e508cd8 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/dstream/DStreamCheckpointData.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/dstream/DStreamCheckpointData.scala @@ -19,7 +19,7 @@ package org.apache.spark.streaming.dstream import scala.collection.mutable.HashMap import scala.reflect.ClassTag -import java.io.{ObjectInputStream, IOException} +import java.io.{ObjectOutputStream, ObjectInputStream, IOException} import org.apache.hadoop.fs.Path import org.apache.hadoop.fs.FileSystem import org.apache.spark.Logging @@ -117,8 +117,32 @@ class DStreamCheckpointData[T: ClassTag] (dstream: DStream[T]) "[\n" + currentCheckpointFiles.size + " checkpoint files \n" + currentCheckpointFiles.mkString("\n") + "\n]" } + @throws(classOf[IOException]) + private def writeObject(oos: ObjectOutputStream) { + logDebug(this.getClass().getSimpleName + ".writeObject used") + if (dstream.context.graph != null) { + dstream.context.graph.synchronized { + if (dstream.context.graph.checkpointInProgress) { + oos.defaultWriteObject() + } else { + val msg = "Object of " + this.getClass.getName + " is being serialized " + + " possibly as a part of closure of an RDD operation. This is because " + + " the DStream object is being referred to from within the closure. " + + " Please rewrite the RDD operation inside this DStream to avoid this. " + + " This has been enforced to avoid bloating of Spark tasks " + + " with unnecessary objects." 
+ throw new java.io.NotSerializableException(msg) + } + } + } else { + throw new java.io.NotSerializableException( + "Graph is unexpectedly null when DStream is being serialized.") + } + } + @throws(classOf[IOException]) private def readObject(ois: ObjectInputStream) { + logDebug(this.getClass().getSimpleName + ".readObject used") ois.defaultReadObject() timeToOldestCheckpointFileTime = new HashMap[Time, Time] timeToCheckpointFile = new HashMap[Time, String] diff --git a/streaming/src/main/scala/org/apache/spark/streaming/dstream/PairDStreamFunctions.scala b/streaming/src/main/scala/org/apache/spark/streaming/dstream/PairDStreamFunctions.scala index f57762321c40e..fb9df2f48eae3 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/dstream/PairDStreamFunctions.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/dstream/PairDStreamFunctions.scala @@ -18,20 +18,17 @@ package org.apache.spark.streaming.dstream import org.apache.spark.streaming.StreamingContext._ -import org.apache.spark.streaming.dstream._ import org.apache.spark.{Partitioner, HashPartitioner} import org.apache.spark.SparkContext._ -import org.apache.spark.rdd.{ClassTags, RDD, PairRDDFunctions} -import org.apache.spark.storage.StorageLevel +import org.apache.spark.rdd.RDD import scala.collection.mutable.ArrayBuffer -import scala.reflect.{ClassTag, classTag} +import scala.reflect.ClassTag -import org.apache.hadoop.mapred.{JobConf, OutputFormat} +import org.apache.hadoop.mapred.JobConf import org.apache.hadoop.mapreduce.{OutputFormat => NewOutputFormat} import org.apache.hadoop.mapred.OutputFormat -import org.apache.hadoop.security.UserGroupInformation import org.apache.hadoop.conf.Configuration import org.apache.spark.streaming.{Time, Duration} @@ -108,7 +105,7 @@ extends Serializable { /** * Combine elements of each key in DStream's RDDs using custom functions. This is similar to the * combineByKey for RDDs. Please refer to combineByKey in - * [[org.apache.spark.rdd.PairRDDFunctions]] for more information. + * org.apache.spark.rdd.PairRDDFunctions in the Spark core documentation for more information. */ def combineByKey[C: ClassTag]( createCombiner: V => C, diff --git a/streaming/src/main/scala/org/apache/spark/streaming/receivers/ActorReceiver.scala b/streaming/src/main/scala/org/apache/spark/streaming/receivers/ActorReceiver.scala index fdf5371a89587..79ed696814f07 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/receivers/ActorReceiver.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/receivers/ActorReceiver.scala @@ -44,40 +44,49 @@ object ReceiverSupervisorStrategy { /** * A receiver trait to be mixed in with your Actor to gain access to - * pushBlock API. + * the API for pushing received data into Spark Streaming for being processed. 
* * Find more details at: http://spark-project.org/docs/latest/streaming-custom-receivers.html * * @example {{{ * class MyActor extends Actor with Receiver{ * def receive { - * case anything :String => pushBlock(anything) + * case anything: String => pushBlock(anything) * } * } - * //Can be plugged in actorStream as follows + * + * // Can be used with an actorStream as follows * ssc.actorStream[String](Props(new MyActor),"MyActorReceiver") * * }}} * - * @note An important point to note: - * Since Actor may exist outside the spark framework, It is thus user's responsibility + * @note Since Actor may exist outside the spark framework, It is thus user's responsibility * to ensure the type safety, i.e parametrized type of push block and InputDStream * should be same. - * */ -trait Receiver { self: Actor ā‡’ +trait Receiver { + + self: Actor ā‡’ // to ensure that this can be added to Actor classes only + + /** + * Push an iterator received data into Spark Streaming for processing + */ def pushBlock[T: ClassTag](iter: Iterator[T]) { context.parent ! Data(iter) } + /** + * Push a single item of received data into Spark Streaming for processing + */ def pushBlock[T: ClassTag](data: T) { context.parent ! Data(data) } - } /** - * Statistics for querying the supervisor about state of workers + * Statistics for querying the supervisor about state of workers. Used in + * conjunction with `StreamingContext.actorStream` and + * [[org.apache.spark.streaming.receivers.Receiver]]. */ case class Statistics(numberOfMsgs: Int, numberOfWorkers: Int, @@ -96,17 +105,15 @@ private[streaming] case class Data[T: ClassTag](data: T) * his own Actor to run as receiver for Spark Streaming input source. * * This starts a supervisor actor which starts workers and also provides - * [http://doc.akka.io/docs/akka/2.0.5/scala/fault-tolerance.html fault-tolerance]. + * [http://doc.akka.io/docs/akka/snapshot/scala/fault-tolerance.html fault-tolerance]. * - * Here's a way to start more supervisor/workers as its children. + * Here's a way to start more supervisor/workers as its children. * * @example {{{ * context.parent ! Props(new Supervisor) * }}} OR {{{ - * context.parent ! Props(new Worker,"Worker") + * context.parent ! 
Props(new Worker, "Worker") * }}} - * - * */ private[streaming] class ActorReceiver[T: ClassTag]( props: Props, diff --git a/streaming/src/main/scala/org/apache/spark/streaming/util/RateLimitedOutputStream.scala b/streaming/src/main/scala/org/apache/spark/streaming/util/RateLimitedOutputStream.scala index b9c0596378b4f..179fd7593982c 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/util/RateLimitedOutputStream.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/util/RateLimitedOutputStream.scala @@ -22,6 +22,7 @@ import scala.annotation.tailrec import java.io.OutputStream import java.util.concurrent.TimeUnit._ +private[streaming] class RateLimitedOutputStream(out: OutputStream, bytesPerSec: Int) extends OutputStream { val SYNC_INTERVAL = NANOSECONDS.convert(10, SECONDS) val CHUNK_SIZE = 8192 diff --git a/streaming/src/main/scala/org/apache/spark/streaming/util/RawTextHelper.scala b/streaming/src/main/scala/org/apache/spark/streaming/util/RawTextHelper.scala index 5b6c048a39620..07021ebb5802a 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/util/RawTextHelper.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/util/RawTextHelper.scala @@ -22,6 +22,7 @@ import org.apache.spark.SparkContext._ import it.unimi.dsi.fastutil.objects.{Object2LongOpenHashMap => OLMap} import scala.collection.JavaConversions.mapAsScalaMap +private[streaming] object RawTextHelper { /** diff --git a/streaming/src/main/scala/org/apache/spark/streaming/util/RawTextSender.scala b/streaming/src/main/scala/org/apache/spark/streaming/util/RawTextSender.scala index 463617a713b22..684b38e8b3102 100644 --- a/streaming/src/main/scala/org/apache/spark/streaming/util/RawTextSender.scala +++ b/streaming/src/main/scala/org/apache/spark/streaming/util/RawTextSender.scala @@ -33,6 +33,7 @@ import org.apache.spark.util.IntParam * A helper program that sends blocks of Kryo-serialized text strings out on a socket at a * specified rate. Used to feed data into RawInputDStream. 
*/ +private[streaming] object RawTextSender extends Logging { def main(args: Array[String]) { if (args.length != 4) { diff --git a/streaming/src/test/java/org/apache/spark/streaming/JavaAPISuite.java b/streaming/src/test/java/org/apache/spark/streaming/JavaAPISuite.java index 8b7d7709bf2c5..4fbbce9b8b90e 100644 --- a/streaming/src/test/java/org/apache/spark/streaming/JavaAPISuite.java +++ b/streaming/src/test/java/org/apache/spark/streaming/JavaAPISuite.java @@ -297,9 +297,9 @@ public void testQueueStream() { Arrays.asList(7,8,9)); JavaSparkContext jsc = new JavaSparkContext(ssc.ssc().sc()); - JavaRDD rdd1 = ssc.sc().parallelize(Arrays.asList(1, 2, 3)); - JavaRDD rdd2 = ssc.sc().parallelize(Arrays.asList(4, 5, 6)); - JavaRDD rdd3 = ssc.sc().parallelize(Arrays.asList(7,8,9)); + JavaRDD rdd1 = ssc.sparkContext().parallelize(Arrays.asList(1, 2, 3)); + JavaRDD rdd2 = ssc.sparkContext().parallelize(Arrays.asList(4, 5, 6)); + JavaRDD rdd3 = ssc.sparkContext().parallelize(Arrays.asList(7,8,9)); LinkedList> rdds = Lists.newLinkedList(); rdds.add(rdd1); diff --git a/streaming/src/test/scala/org/apache/spark/streaming/CheckpointSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/CheckpointSuite.scala index 89daf4758661b..831e7c1471a09 100644 --- a/streaming/src/test/scala/org/apache/spark/streaming/CheckpointSuite.scala +++ b/streaming/src/test/scala/org/apache/spark/streaming/CheckpointSuite.scala @@ -151,17 +151,29 @@ class CheckpointSuite extends TestSuiteBase { val value = "myvalue" System.setProperty(key, value) ssc = new StreamingContext(master, framework, batchDuration) + val originalConf = ssc.conf + val cp = new Checkpoint(ssc, Time(1000)) - assert(!cp.sparkConf.contains("spark.driver.host")) - assert(!cp.sparkConf.contains("spark.driver.port")) - assert(!cp.sparkConf.contains("spark.hostPort")) - assert(cp.sparkConf.get(key) === value) + val cpConf = cp.sparkConf + assert(cpConf.get("spark.master") === originalConf.get("spark.master")) + assert(cpConf.get("spark.app.name") === originalConf.get("spark.app.name")) + assert(cpConf.get(key) === value) ssc.stop() + + // Serialize/deserialize to simulate write to storage and reading it back val newCp = Utils.deserialize[Checkpoint](Utils.serialize(cp)) - assert(!newCp.sparkConf.contains("spark.driver.host")) - assert(!newCp.sparkConf.contains("spark.driver.port")) - assert(!newCp.sparkConf.contains("spark.hostPort")) - assert(newCp.sparkConf.get(key) === value) + + val newCpConf = newCp.sparkConf + assert(newCpConf.get("spark.master") === originalConf.get("spark.master")) + assert(newCpConf.get("spark.app.name") === originalConf.get("spark.app.name")) + assert(newCpConf.get(key) === value) + assert(!newCpConf.contains("spark.driver.host")) + assert(!newCpConf.contains("spark.driver.port")) + + // Check if all the parameters have been restored + ssc = new StreamingContext(null, newCp, null) + val restoredConf = ssc.conf + assert(restoredConf.get(key) === value) } diff --git a/tools/pom.xml b/tools/pom.xml index 28f5ef14b1a35..bb8f747ae9328 100644 --- a/tools/pom.xml +++ b/tools/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent - 0.9.0-incubating-SNAPSHOT + 0.9.1-incubating-SNAPSHOT ../pom.xml diff --git a/yarn/alpha/pom.xml b/yarn/alpha/pom.xml index 8291e9e7a36ce..349c8358ecf90 100644 --- a/yarn/alpha/pom.xml +++ b/yarn/alpha/pom.xml @@ -20,7 +20,7 @@ org.apache.spark yarn-parent_2.10 - 0.9.0-incubating-SNAPSHOT + 0.9.0-incubating ../pom.xml diff --git 
a/yarn/alpha/src/main/scala/org/apache/spark/deploy/yarn/WorkerLauncher.scala b/yarn/alpha/src/main/scala/org/apache/spark/deploy/yarn/WorkerLauncher.scala index 9fe4d64a0fca0..138c27910b0b0 100644 --- a/yarn/alpha/src/main/scala/org/apache/spark/deploy/yarn/WorkerLauncher.scala +++ b/yarn/alpha/src/main/scala/org/apache/spark/deploy/yarn/WorkerLauncher.scala @@ -210,7 +210,7 @@ class WorkerLauncher(args: ApplicationMasterArguments, conf: Configuration, spar // Wait until all containers have finished // TODO: This is a bit ugly. Can we make it nicer? // TODO: Handle container failure - while(yarnAllocator.getNumWorkersRunning < args.numWorkers) { + while ((yarnAllocator.getNumWorkersRunning < args.numWorkers) && (!driverClosed)) { yarnAllocator.allocateContainers(math.max(args.numWorkers - yarnAllocator.getNumWorkersRunning, 0)) Thread.sleep(100) } diff --git a/yarn/pom.xml b/yarn/pom.xml index aea8b0cddefa2..508317b5fc01c 100644 --- a/yarn/pom.xml +++ b/yarn/pom.xml @@ -20,7 +20,7 @@ org.apache.spark spark-parent - 0.9.0-incubating-SNAPSHOT + 0.9.1-incubating-SNAPSHOT ../pom.xml diff --git a/yarn/stable/pom.xml b/yarn/stable/pom.xml index 62fe3e274250f..04b29c76e5830 100644 --- a/yarn/stable/pom.xml +++ b/yarn/stable/pom.xml @@ -20,7 +20,7 @@ org.apache.spark yarn-parent_2.10 - 0.9.0-incubating-SNAPSHOT + 0.9.1-incubating-SNAPSHOT ../pom.xml diff --git a/yarn/stable/src/main/scala/org/apache/spark/deploy/yarn/WorkerLauncher.scala b/yarn/stable/src/main/scala/org/apache/spark/deploy/yarn/WorkerLauncher.scala index 78353224fa4b8..40600f38e5e73 100644 --- a/yarn/stable/src/main/scala/org/apache/spark/deploy/yarn/WorkerLauncher.scala +++ b/yarn/stable/src/main/scala/org/apache/spark/deploy/yarn/WorkerLauncher.scala @@ -193,7 +193,7 @@ class WorkerLauncher(args: ApplicationMasterArguments, conf: Configuration, spar // TODO: Handle container failure yarnAllocator.addResourceRequests(args.numWorkers) - while (yarnAllocator.getNumWorkersRunning < args.numWorkers) { + while ((yarnAllocator.getNumWorkersRunning < args.numWorkers) && (!driverClosed)) { yarnAllocator.allocateResources() Thread.sleep(100) } diff --git a/yarn/stable/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocationHandler.scala b/yarn/stable/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocationHandler.scala index 738ff986d85a5..1ac61124cb028 100644 --- a/yarn/stable/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocationHandler.scala +++ b/yarn/stable/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocationHandler.scala @@ -532,15 +532,15 @@ private[yarn] class YarnAllocationHandler( priority: Int ): ArrayBuffer[ContainerRequest] = { - val memoryResource = Records.newRecord(classOf[Resource]) - memoryResource.setMemory(workerMemory + YarnAllocationHandler.MEMORY_OVERHEAD) + val memoryRequest = workerMemory + YarnAllocationHandler.MEMORY_OVERHEAD + val resource = Resource.newInstance(memoryRequest, workerCores) val prioritySetting = Records.newRecord(classOf[Priority]) prioritySetting.setPriority(priority) val requests = new ArrayBuffer[ContainerRequest]() for (i <- 0 until numWorkers) { - requests += new ContainerRequest(memoryResource, hosts, racks, prioritySetting) + requests += new ContainerRequest(resource, hosts, racks, prioritySetting) } requests }
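For reference, a minimal standalone Scala sketch (not part of the patch above) of the request-building pattern that the stable-YARN hunk switches to, where each container ask now carries CPU cores as well as memory. It targets the Hadoop 2.2+ AMRMClient API (Resource.newInstance, Records.newRecord, AMRMClient.ContainerRequest); the object and method names (ContainerRequestSketch, createRequests) and the memory, core and worker figures are hypothetical stand-ins for what the real YarnAllocationHandler reads from its arguments and its MEMORY_OVERHEAD constant.

import scala.collection.mutable.ArrayBuffer

import org.apache.hadoop.yarn.api.records.{Priority, Resource}
import org.apache.hadoop.yarn.client.api.AMRMClient.ContainerRequest
import org.apache.hadoop.yarn.util.Records

object ContainerRequestSketch {
  // Hypothetical figures; the real values come from ApplicationMasterArguments
  // and the allocation handler's memory-overhead constant.
  val workerMemory = 1024   // MB of worker heap requested per container
  val memoryOverhead = 384  // extra MB asked from YARN on top of the heap
  val workerCores = 2
  val numWorkers = 3

  def createRequests(hosts: Array[String], racks: Array[String], priority: Int)
    : ArrayBuffer[ContainerRequest] = {
    // Post-patch shape: the Resource carries both memory and vcores via
    // Resource.newInstance, instead of a memory-only Records.newRecord(classOf[Resource]).
    val resource = Resource.newInstance(workerMemory + memoryOverhead, workerCores)

    val prioritySetting = Records.newRecord(classOf[Priority])
    prioritySetting.setPriority(priority)

    // One identical ask per requested worker container.
    val requests = new ArrayBuffer[ContainerRequest]()
    for (i <- 0 until numWorkers) {
      requests += new ContainerRequest(resource, hosts, racks, prioritySetting)
    }
    requests
  }

  def main(args: Array[String]): Unit = {
    val requests = createRequests(Array("node1"), Array("/default-rack"), priority = 1)
    println(s"Built ${requests.size} container requests, capability = ${requests.head.getCapability}")
  }
}

The point this illustrates, relative to the pre-patch code in the hunk above, is that the requested vcore count is now part of the Resource handed to YARN, rather than a memory-only record built with Records.newRecord.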