diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 1398721..199388e 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -81,11 +81,11 @@ jobs: - name: Make target directories if: github.event_name != 'pull_request' && (startsWith(github.ref, 'refs/tags/v') || github.ref == 'refs/heads/main') - run: mkdir -p lucene/target target unidocs/target .js/target site/target .jvm/target .native/target example/target project/target + run: mkdir -p lucene/target benchmarks/target target unidocs/target .js/target site/target .jvm/target .native/target example/target project/target - name: Compress target directories if: github.event_name != 'pull_request' && (startsWith(github.ref, 'refs/tags/v') || github.ref == 'refs/heads/main') - run: tar cf targets.tar lucene/target target unidocs/target .js/target site/target .jvm/target .native/target example/target project/target + run: tar cf targets.tar lucene/target benchmarks/target target unidocs/target .js/target site/target .jvm/target .native/target example/target project/target - name: Upload target directories if: github.event_name != 'pull_request' && (startsWith(github.ref, 'refs/tags/v') || github.ref == 'refs/heads/main') diff --git a/benchmarks/src/main/scala/textmogrify/LuceneStreamingBenchmark.scala b/benchmarks/src/main/scala/textmogrify/LuceneStreamingBenchmark.scala new file mode 100644 index 0000000..2a1dd30 --- /dev/null +++ b/benchmarks/src/main/scala/textmogrify/LuceneStreamingBenchmark.scala @@ -0,0 +1,77 @@ +/* + * Copyright 2022 Pig.io + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package textmogrify +package benchmarks + +import cats.effect.IO +import cats.effect.unsafe.implicits.global +import fs2.Stream +import fs2.io.file.{Files, Path} + +import java.util.concurrent.TimeUnit +import org.openjdk.jmh.annotations._ +import textmogrify.lucene.AnalyzerPipe +import textmogrify.lucene.AnalyzerBuilder +
+/** JMH throughput benchmarks for tokenizing a byte stream with an `AnalyzerPipe`.
+  *
+  * To run the benchmark from within sbt:
+  *
+  * jmh:run -i 10 -wi 10 -f 2 -t 1 textmogrify.benchmarks.LuceneStreamingBenchmark
+  *
+  * Which means "10 iterations", "10 warm-up iterations", "2 forks", "1 thread". Please note that
+  * benchmarks should usually be executed for at least 10 iterations (as a rule of thumb), but
+  * more is better.
+  */
+@State(Scope.Thread)
+@BenchmarkMode(Array(Mode.Throughput))
+@OutputTimeUnit(TimeUnit.SECONDS)
+class LuceneStreamingBenchmark {
+
+  /** Raw contents of `../LICENSE`; assigned by `setup()`. */
+  var asciiBytes: Array[Byte] = _
+
+  /** Reads `../LICENSE` (resolved against the JVM's working directory) fully into memory. */
+  @Setup
+  def setup(): Unit =
+    asciiBytes = Files[IO]
+      .readAll(Path("../LICENSE"))
+      .compile
+      .to(Array)
+      .unsafeRunSync()
+
+  /** Measures `lastToken(1)`: 1 is passed as the second argument of `tokenizeBytes`. */
+  @Benchmark
+  def tokenizeBytesTokenN1(): String = lastToken(1)
+
+  /** Measures `lastToken(128)`: 128 is passed as the second argument of `tokenizeBytes`. */
+  @Benchmark
+  def tokenizeBytesTokenN128(): String = lastToken(128)
+
+  /** Builds a fresh analyzer pipe, streams `asciiBytes` through `tokenizeBytes` and returns the
+    * last emitted element. `lastOrError` fails with `NoSuchElementException` on an empty stream,
+    * so the result no longer needs an unchecked `Option.get`.
+    *
+    * @param n forwarded unchanged as the second argument of `tokenizeBytes`
+    */
+  private def lastToken(n: Int): String = {
+    val analyzer = AnalyzerBuilder.default.withLowerCasing.build[IO]
+    val pipe = AnalyzerPipe.fromResource(analyzer)
+    val bytes: Stream[IO, Byte] = Stream.emits(asciiBytes)
+    pipe.tokenizeBytes(bytes, n).compile.lastOrError.unsafeRunSync()
+  }
+}
diff --git a/benchmarks/src/main/scala/textmogrify/LuceneTokenizationBenchmark.scala b/benchmarks/src/main/scala/textmogrify/LuceneTokenizationBenchmark.scala new file mode 100644 index 0000000..eb913ff --- /dev/null +++ b/benchmarks/src/main/scala/textmogrify/LuceneTokenizationBenchmark.scala @@ -0,0 +1,80 @@ +/* + * Copyright 2022 Pig.io + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package textmogrify +package benchmarks + +import cats.syntax.all._ +import cats.effect.IO +import cats.effect.unsafe.implicits.global +import fs2.text +import fs2.io.file.{Files, Path} + +import java.util.concurrent.TimeUnit +import org.openjdk.jmh.annotations._ +import textmogrify.lucene.AnalyzerBuilder +
+/** JMH throughput benchmarks for tokenizing the lines of a text file one line at a time.
+  *
+  * To run the benchmark from within sbt:
+  *
+  * jmh:run -i 10 -wi 10 -f 2 -t 1 textmogrify.benchmarks.LuceneTokenizationBenchmark
+  *
+  * Which means "10 iterations", "10 warm-up iterations", "2 forks", "1 thread". Please note that
+  * benchmarks should usually be executed for at least 10 iterations (as a rule of thumb), but
+  * more is better.
+  */
+@State(Scope.Thread)
+@BenchmarkMode(Array(Mode.Throughput))
+@OutputTimeUnit(TimeUnit.SECONDS)
+class LuceneTokenizationBenchmark {
+
+  /** Lines of `../LICENSE`; assigned by `setup()`. */
+  var lines: Vector[String] = _
+
+  /** Reads `../LICENSE`, decodes it as UTF-8 and stores the text split into lines. */
+  @Setup
+  def setup(): Unit = {
+    val decoded = Files[IO].readAll(Path("../LICENSE")).through(text.utf8.decode)
+    lines = decoded.through(text.lines).compile.toVector.unsafeRunSync()
+  }
+
+  /** Calls `use` but ignores the yielded function; wraps each line unchanged in a Vector. */
+  @Benchmark
+  def doNothing(): Vector[String] = {
+    val managed = AnalyzerBuilder.default.withLowerCasing.tokenizer[IO]
+    val program = managed.use(_ => lines.traverse(line => IO.pure(Vector(line))))
+    program.unsafeRunSync().last
+  }
+
+  /** Ignores the yielded function; lower-cases each line and splits it on " " by hand. */
+  @Benchmark
+  def manualToLowerCaseAndSplit(): Vector[String] = {
+    val managed = AnalyzerBuilder.default.withLowerCasing.tokenizer[IO]
+    val program = managed.use { _ =>
+      lines.traverse(line => IO.pure(line.toLowerCase.split(" ").toVector))
+    }
+    program.unsafeRunSync().last
+  }
+
+  /** Applies the function yielded by `use` to every line; returns the last line's result. */
+  @Benchmark
+  def tokenizeAndLowerCase(): Vector[String] = {
+    val managed = AnalyzerBuilder.default.withLowerCasing.tokenizer[IO]
+    val program = managed.use(tokenize => lines.traverse(tokenize))
+    program.unsafeRunSync().last
+  }
+}
diff --git a/build.sbt b/build.sbt index 9be822e..44cff19 100644 --- a/build.sbt +++ b/build.sbt @@ -33,7 +33,7 @@ val luceneV = "9.3.0" val munitV = "1.0.0-M6" val munitCatsEffectV = "2.0.0-M1" -lazy val root = tlCrossRootProject.aggregate(lucene, example, unidocs) +lazy val root = 
tlCrossRootProject.aggregate(lucene, example, unidocs, benchmarks) lazy val lucene = project .in(file("lucene")) @@ -75,3 +75,12 @@ lazy val unidocs = project name := "textmogrify-docs", ScalaUnidoc / unidoc / unidocProjectFilter := inProjects(lucene), ) +
+lazy val benchmarks = project // benchmark module rooted at benchmarks/, built on top of `lucene`
+  .in(file("benchmarks"))
+  .dependsOn(lucene)
+  .settings(
+    name := "textmogrify-benchmarks",
+    libraryDependencies += "org.typelevel" %% "cats-effect" % catsEffectV, // catsEffectV is defined outside the visible hunks
+  )
+  .enablePlugins(NoPublishPlugin, JmhPlugin) // NOTE(review): JmhPlugin presumably comes from the sbt-jmh plugin added in project/plugins.sbt
diff --git a/project/plugins.sbt b/project/plugins.sbt index a79aec3..109da51 100644 --- a/project/plugins.sbt +++ b/project/plugins.sbt @@ -1,2 +1,3 @@ addSbtPlugin("org.typelevel" % "sbt-typelevel" % "0.5.0-M5") addSbtPlugin("org.typelevel" % "sbt-typelevel-site" % "0.5.0-M5") +addSbtPlugin("pl.project13.scala" % "sbt-jmh" % "0.4.3")