From b042d168a8cb5e2346821cd81de5779bcb55b1bd Mon Sep 17 00:00:00 2001 From: Olivier Blanvillain Date: Fri, 19 May 2017 08:45:29 +0200 Subject: [PATCH 1/8] Copy README to docs via sbt --- build.sbt | 8 ++++++++ scripts/docs-build.sh | 2 +- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/build.sbt b/build.sbt index 6de5d5ff..af5e0e22 100644 --- a/build.sbt +++ b/build.sbt @@ -184,3 +184,11 @@ lazy val credentialSettings = Seq( password <- Option(System.getenv().get("SONATYPE_PASSWORD")) } yield Credentials("Sonatype Nexus Repository Manager", "oss.sonatype.org", username, password)).toSeq ) + +copyReadme := copyReadmeImpl.value +lazy val copyReadme = taskKey[Unit]("copy for website generation") +lazy val copyReadmeImpl = Def.task { + val from = baseDirectory.value / "README.md" + val to = baseDirectory.value / "docs" / "src" / "main" / "tut" / "README.md" + sbt.IO.copy(List((from, to)), overwrite = true, preserveLastModified = true) +} diff --git a/scripts/docs-build.sh b/scripts/docs-build.sh index 51ece68a..1b243684 100644 --- a/scripts/docs-build.sh +++ b/scripts/docs-build.sh @@ -2,7 +2,7 @@ set -eux -sbt tut +sbt copyReadme tut gitbook="node_modules/gitbook-cli/bin/gitbook.js" From 6c95348eed3abd67bacdfc73538ee1f752d578ef Mon Sep 17 00:00:00 2001 From: Olivier Blanvillain Date: Fri, 19 May 2017 08:45:51 +0200 Subject: [PATCH 2/8] Fix link to FeatureOverview.md --- docs/src/main/tut/README.md | 0 docs/src/main/tut/SUMMARY.md | 4 ++-- 2 files changed, 2 insertions(+), 2 deletions(-) delete mode 100644 docs/src/main/tut/README.md diff --git a/docs/src/main/tut/README.md b/docs/src/main/tut/README.md deleted file mode 100644 index e69de29b..00000000 diff --git a/docs/src/main/tut/SUMMARY.md b/docs/src/main/tut/SUMMARY.md index 3737f6d3..3bea0d79 100644 --- a/docs/src/main/tut/SUMMARY.md +++ b/docs/src/main/tut/SUMMARY.md @@ -1,7 +1,7 @@ -- [TypedDataset: Feature Overview](GettingStarted.md) +- [TypedDataset: Feature Overview](FeatureOverview.md) - 
[Comparing TypedDatasets with Spark's Datasets](TypedDatasetVsSparkDataset.md) - [Typed Encoders in Frameless](TypedEncoder.md) - [Injection: Creating Custom Encoders](Injection.md) - [Job\[A\]](Job.md) - [Using Cats with RDDs](Cats.md) -- [Proof of Concept: TypedDataFrame](TypedDataFrame.md) \ No newline at end of file +- [Proof of Concept: TypedDataFrame](TypedDataFrame.md) From 8ba57bde7182e03957f7957712f415d3ccdeaff6 Mon Sep 17 00:00:00 2001 From: Olivier Blanvillain Date: Fri, 19 May 2017 09:09:00 +0200 Subject: [PATCH 3/8] Remove tailing spaces in Injection.md --- docs/src/main/tut/Injection.md | 43 +++++++++++++++++----------------- 1 file changed, 22 insertions(+), 21 deletions(-) diff --git a/docs/src/main/tut/Injection.md b/docs/src/main/tut/Injection.md index b02f65e2..9e323383 100644 --- a/docs/src/main/tut/Injection.md +++ b/docs/src/main/tut/Injection.md @@ -1,4 +1,5 @@ # Injection: Creating Custom Encoders + ```tut:invisible import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.sql.SparkSession @@ -11,33 +12,33 @@ implicit val sqlContext = spark.sqlContext spark.sparkContext.setLogLevel("WARN") import spark.implicits._ -``` +``` Injection lets us define encoders for types that do not have one by injecting `A` into an encodable type `B`. -This is the definition of the injection typeclass: +This is the definition of the injection typeclass: ```scala trait Injection[A, B] extends Serializable { def apply(a: A): B def invert(b: B): A } -``` +``` ## Example -Let's define a simple case class: +Let's define a simple case class: ```tut:book case class Person(age: Int, birthday: java.util.Date) val people = Seq(Person(42, new java.util.Date)) -``` +``` And an instance of a `TypedDataset`: ```tut:book:fail val personDS = TypedDataset.create(people) -``` +``` -Looks like we can't, a `TypedEncoder` instance of `Person` is not available, or more precisely for `java.util.Date`. 
+But we can define an injection from
```tut:book:fail val personDS = TypedDataset.create(people) -``` +``` -Let's define an injection instance for `Gender`: +Let's define an injection instance for `Gender`: ```tut:book implicit val genderToInt: Injection[Gender, Int] = Injection( @@ -98,14 +99,14 @@ implicit val genderToInt: Injection[Gender, Int] = Injection( case 2 => Female case 3 => Other }) -``` +``` -And now we can create our `TypedDataset`: +And now we can create our `TypedDataset`: ```tut:book val personDS = TypedDataset.create(people) -``` +``` ```tut:invisible spark.stop() -``` +``` From 26a0180da9d7c9f2670cb0c367d1e42b372a0b97 Mon Sep 17 00:00:00 2001 From: Olivier Blanvillain Date: Mon, 22 May 2017 09:46:38 +0200 Subject: [PATCH 4/8] Fix typos in Job.md --- docs/src/main/tut/Job.md | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/docs/src/main/tut/Job.md b/docs/src/main/tut/Job.md index 155c821b..44e4a080 100644 --- a/docs/src/main/tut/Job.md +++ b/docs/src/main/tut/Job.md @@ -1,19 +1,19 @@ # Job\[A\] -All operations on `TypedDataset` are lazy. An operation either returns a new +All operations on `TypedDataset` are lazy. An operation either returns a new transformed `TypedDataset` or a `Job[A]`, where `A` is the result of running a -non-lazy computation in Spark. `Job` serves several functions: +non-lazy computation in Spark. 
`Job` serves several functions: - Makes all operations on a `TypedDataset` lazy, which makes them more predictable compared to having -few operations being lazy and other being strict: +few operations being lazy and other being strict - Allows the programmer to make expensive blocking operations explicit - Allows for Spark jobs to be lazily sequenced using monadic composition via for-comprehension - Provides an obvious place where you can annotate/name your Spark jobs to make it easier to track different parts of your application in the Spark UI -The toy example showcases the use of for-comprehension to explicitly sequences Spark Jobs. +The toy example showcases the use of for-comprehension to explicitly sequences Spark Jobs. First we calculate the size of the `TypedDataset` and then we collect to the driver -exactly 20% of its elements: +exactly 20% of its elements: ```tut:invisible import org.apache.spark.{SparkConf, SparkContext} @@ -32,28 +32,28 @@ import spark.implicits._ ```tut:book val ds = TypedDataset.create(1 to 20) -val countAndTakeJob = +val countAndTakeJob = for { - count <- ds.count() + count <- ds.count() sample <- ds.take((count/5).toInt) } yield sample countAndTakeJob.run() ``` -The `countAndTakeJob` can either be executed using `run()` (as we show above) or it can +The `countAndTakeJob` can either be executed using `run()` (as we show above) or it can be passed along to other parts of the program to be further composed into more complex sequences -of Spark jobs. +of Spark jobs. ```tut:book import frameless.Job def computeMinOfSample(sample: Job[Seq[Int]]): Job[Int] = sample.map(_.min) -val finalJob = computeMinOfSample(countAndTakeJob) +val finalJob = computeMinOfSample(countAndTakeJob) ``` -Now we can execute this new job by specifying a [group-id](https://spark.apache.org/docs/latest/api/scala/index.html#org.apache.spark.SparkContext@setJobGroup(groupId:String,description:String,interruptOnCancel:Boolean):Unit) and a description. 
-This allows the programmer to see this information on the Spark UI and help track, say, +Now we can execute this new job by specifying a [group-id][group-id] and a description. +This allows the programmer to see this information on the Spark UI and help track, say, performance issues. ```tut:book @@ -66,4 +66,6 @@ finalJob. ```tut:invisible spark.stop() -``` \ No newline at end of file +``` + +[group-id]: https://spark.apache.org/docs/latest/api/scala/index.html#org.apache.spark.SparkContext@setJobGroup(groupId:String,description:String,interruptOnCancel:Boolean):Unit From 4a1b17c5e3bd2f4ea601b28fde0c3348c8afea81 Mon Sep 17 00:00:00 2001 From: Olivier Blanvillain Date: Mon, 22 May 2017 09:46:58 +0200 Subject: [PATCH 5/8] Update doc links in README --- README.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index ba22f352..adf34382 100644 --- a/README.md +++ b/README.md @@ -18,12 +18,12 @@ associated channels (e.g. GitHub, Gitter) to be a safe and friendly environment ## Documentation -* [TypedDataset: Feature Overview](http://olivierblanvillain.github.io/frameless/GettingStarted.html) -* [Comparing TypedDatasets with Spark's Datasets](http://olivierblanvillain.github.io/frameless/TypedDatasetVsSparkDataset.html) -* [Typed Encoders in Frameless](http://olivierblanvillain.github.io/frameless/TypedEncoder.html) -* [Injection: Creating Custom Encoders](http://olivierblanvillain.github.io/frameless/Injection.html) -* [Using Cats with RDDs](http://olivierblanvillain.github.io/frameless/Cats.html) -* [Proof of Concept: TypedDataFrame](http://olivierblanvillain.github.io/frameless/TypedDataFrame.html) +* [TypedDataset: Feature Overview](http://typelevel.github.io/frameless/docs/book/FeatureOverview.html) +* [Comparing TypedDatasets with Spark's Datasets](http://typelevel.github.io/frameless/docs/book/TypedDatasetVsSparkDataset.html) +* [Typed Encoders in 
Frameless](http://typelevel.github.io/frameless/docs/book/TypedEncoder.html) +* [Injection: Creating Custom Encoders](http://typelevel.github.io/frameless/docs/book/Injection.html) +* [Using Cats with RDDs](http://typelevel.github.io/frameless/docs/book/Cats.html) +* [Proof of Concept: TypedDataFrame](http://typelevel.github.io/frameless/docs/book/TypedDataFrame.html) ## Why? From 1b4d2d687eeb5f150ae4723c3a2514070f126409 Mon Sep 17 00:00:00 2001 From: Olivier Blanvillain Date: Mon, 22 May 2017 10:04:00 +0200 Subject: [PATCH 6/8] Update docs-publish.sh script --- scripts/docs-publish.sh | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/scripts/docs-publish.sh b/scripts/docs-publish.sh index 2da4bd29..be6cc66d 100644 --- a/scripts/docs-publish.sh +++ b/scripts/docs-publish.sh @@ -2,14 +2,24 @@ set -eux +# Check that the working directory is a git repository and the repository has no outstanding changes. +git diff-index --quiet HEAD + +commit=$(git show -s --format=%h) + git checkout gh-pages -git checkout master . +git merge "$commit" bash scripts/docs-build.sh git add . -git commit -am "Update book" +git commit -am "Rebuild documentation ($commit)" -echo "git push" +echo "Verify that you didn't break anything:" +echo " $ python -m SimpleHTTPServer 8000" +echo " $ xdg-open http://localhost:8000/docs/book/" +echo "" +echo "Then push to the gh-pages branch:" +echo " $ git push gh-pages" From 5c3635d2c50662d91f72ef828a5e53d9cba651ad Mon Sep 17 00:00:00 2001 From: Olivier Blanvillain Date: Tue, 23 May 2017 10:53:34 +0200 Subject: [PATCH 7/8] Add Job to the README index --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index adf34382..518a8504 100644 --- a/README.md +++ b/README.md @@ -22,6 +22,7 @@ associated channels (e.g. 
GitHub, Gitter) to be a safe and friendly environment * [Comparing TypedDatasets with Spark's Datasets](http://typelevel.github.io/frameless/docs/book/TypedDatasetVsSparkDataset.html) * [Typed Encoders in Frameless](http://typelevel.github.io/frameless/docs/book/TypedEncoder.html) * [Injection: Creating Custom Encoders](http://typelevel.github.io/frameless/docs/book/Injection.html) +* [Job\[A\]](http://typelevel.github.io/frameless/docs/book/Job.htmlé) * [Using Cats with RDDs](http://typelevel.github.io/frameless/docs/book/Cats.html) * [Proof of Concept: TypedDataFrame](http://typelevel.github.io/frameless/docs/book/TypedDataFrame.html) From eefecb5c9f43b97ff735d45fe5bcf680749e1340 Mon Sep 17 00:00:00 2001 From: Olivier Blanvillain Date: Tue, 23 May 2017 10:55:29 +0200 Subject: [PATCH 8/8] Move documentation to the root --- README.md | 14 +++++++------- scripts/docs-build.sh | 2 ++ scripts/docs-publish.sh | 2 +- 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 518a8504..fef135d7 100644 --- a/README.md +++ b/README.md @@ -18,13 +18,13 @@ associated channels (e.g. 
GitHub, Gitter) to be a safe and friendly environment ## Documentation -* [TypedDataset: Feature Overview](http://typelevel.github.io/frameless/docs/book/FeatureOverview.html) -* [Comparing TypedDatasets with Spark's Datasets](http://typelevel.github.io/frameless/docs/book/TypedDatasetVsSparkDataset.html) -* [Typed Encoders in Frameless](http://typelevel.github.io/frameless/docs/book/TypedEncoder.html) -* [Injection: Creating Custom Encoders](http://typelevel.github.io/frameless/docs/book/Injection.html) -* [Job\[A\]](http://typelevel.github.io/frameless/docs/book/Job.htmlé) -* [Using Cats with RDDs](http://typelevel.github.io/frameless/docs/book/Cats.html) -* [Proof of Concept: TypedDataFrame](http://typelevel.github.io/frameless/docs/book/TypedDataFrame.html) +* [TypedDataset: Feature Overview](http://typelevel.org/frameless/FeatureOverview.html) +* [Comparing TypedDatasets with Spark's Datasets](http://typelevel.org/frameless/TypedDatasetVsSparkDataset.html) +* [Typed Encoders in Frameless](http://typelevel.org/frameless/TypedEncoder.html) +* [Injection: Creating Custom Encoders](http://typelevel.org/frameless/Injection.html) +* [Job\[A\]](http://typelevel.org/frameless/Job.html) +* [Using Cats with RDDs](http://typelevel.org/frameless/Cats.html) +* [Proof of Concept: TypedDataFrame](http://typelevel.org/frameless/TypedDataFrame.html) ## Why? diff --git a/scripts/docs-build.sh b/scripts/docs-build.sh index 1b243684..2d96c5f8 100644 --- a/scripts/docs-build.sh +++ b/scripts/docs-build.sh @@ -13,4 +13,6 @@ fi $gitbook build docs/target/tut docs/book +mv docs/book/* . 
+ exit 0 diff --git a/scripts/docs-publish.sh b/scripts/docs-publish.sh index be6cc66d..013383ed 100644 --- a/scripts/docs-publish.sh +++ b/scripts/docs-publish.sh @@ -19,7 +19,7 @@ git commit -am "Rebuild documentation ($commit)" echo "Verify that you didn't break anything:" echo " $ python -m SimpleHTTPServer 8000" -echo " $ xdg-open http://localhost:8000/docs/book/" +echo " $ xdg-open http://localhost:8000/" echo "" echo "Then push to the gh-pages branch:" echo " $ git push gh-pages"