Commit
Merge pull request #51 from valencik/rework-tokenizer
Change Tokenizer to take Resource[F, Analyzer]
valencik authored Oct 20, 2022
2 parents e016823 + 431e015 commit d6a1158
Showing 4 changed files with 17 additions and 19 deletions.
@@ -104,10 +104,9 @@ sealed abstract class AnalyzerBuilder private[lucene] (config: Config) {
   /** Build the Analyzer wrapped inside a Resource. */
   def build[F[_]](implicit F: Sync[F]): Resource[F, Analyzer]
 
-  /** Directly construct a tokenizing function
-    */
+  /** Build a tokenizing function that uses the Analyzer and collects tokens in a vector */
   def tokenizer[F[_]](implicit F: Sync[F]): Resource[F, String => F[Vector[String]]] =
-    build.map(a => Tokenizer.vectorTokenizer(a))
+    Tokenizer.vectorTokenizer(build)
 
   private[lucene] def mkFromStandardTokenizer[F[_]](
       config: Config
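On the builder side, `tokenizer` now simply hands `build` to `Tokenizer.vectorTokenizer`, so the returned function lives inside the same Resource as the Analyzer. A usage sketch; the `AnalyzerBuilder.default` and `withLowerCasing` names are assumptions about the builder API, which this diff does not show:

    import cats.effect.{IO, IOApp}
    import textmogrify.lucene.AnalyzerBuilder

    object TokenizeDemo extends IOApp.Simple {
      val run: IO[Unit] =
        AnalyzerBuilder.default.withLowerCasing // hypothetical builder chain
          .tokenizer[IO]
          .use(tokenize => tokenize("Hello my name is Neeko").flatMap(IO.println))
    }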
@@ -26,12 +26,4 @@ object AnalyzerResource {
     */
   def fromAnalyzer[F[_]](analyzer: => Analyzer)(implicit F: Sync[F]): Resource[F, Analyzer] =
     Resource.make(F.delay(analyzer))(analyzer => F.delay(analyzer.close()))
-
-  /** Construct a tokenizing function directly from an Analyzer
-    */
-  def tokenizer[F[_]](
-      analyzer: => Analyzer
-  )(implicit F: Sync[F]): Resource[F, String => F[Vector[String]]] =
-    fromAnalyzer(analyzer)
-      .map(a => Tokenizer.vectorTokenizer(a))
 }
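The deleted `AnalyzerResource.tokenizer` helper is now expressed by composing the two remaining pieces, exactly as the updated tests below do. A minimal sketch:

    import cats.effect.{IO, Resource}
    import org.apache.lucene.analysis.en.EnglishAnalyzer
    import textmogrify.lucene.{AnalyzerResource, Tokenizer}

    // fromAnalyzer manages the Analyzer's lifecycle; vectorTokenizer maps it to a function.
    val tokenizer: Resource[IO, String => IO[Vector[String]]] =
      Tokenizer.vectorTokenizer(AnalyzerResource.fromAnalyzer[IO](new EnglishAnalyzer()))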
8 changes: 6 additions & 2 deletions lucene/src/main/scala/textmogrify/lucene/Tokenizer.scala
@@ -16,6 +16,7 @@

 package textmogrify.lucene
 
+import cats.effect.Resource
 import cats.effect.kernel.Sync
 import scala.collection.mutable.ArrayBuffer
 import java.io.StringReader
@@ -27,8 +28,10 @@ object Tokenizer {
   /** Build a tokenizing function that runs its input through the Analyzer and collects
     * all tokens into a `Vector`
     */
-  def vectorTokenizer[F[_]](analyzer: Analyzer)(implicit F: Sync[F]): String => F[Vector[String]] =
-    (s: String) =>
+  def vectorTokenizer[F[_]](
+      analyzer: Resource[F, Analyzer]
+  )(implicit F: Sync[F]): Resource[F, String => F[Vector[String]]] =
+    analyzer.map { analyzer => (s: String) =>
       F.delay {
         val ts = analyzer.tokenStream("textmogrify-field", new StringReader(s))
         val termAtt = ts.addAttribute(classOf[CharTermAttribute])
@@ -42,4 +45,5 @@
         ts.close()
         arr.toVector
       }
+    }
 }
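Because the tokenizing function is now wrapped in a Resource, a single Analyzer acquisition can serve many inputs and is guaranteed to close when `use` exits. A sketch under that reading (EnglishAnalyzer stands in for any Lucene Analyzer):

    import cats.effect.IO
    import cats.syntax.all._
    import org.apache.lucene.analysis.en.EnglishAnalyzer
    import textmogrify.lucene.{AnalyzerResource, Tokenizer}

    val docs = List("Hello my name is Neeko", "I enjoy jumping on counters")
    // The Analyzer opens once, is shared by every call to `tokenize`, then closes.
    val tokens: IO[List[Vector[String]]] =
      Tokenizer
        .vectorTokenizer(AnalyzerResource.fromAnalyzer[IO](new EnglishAnalyzer()))
        .use(tokenize => docs.traverse(tokenize))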
@@ -24,16 +24,18 @@ import org.apache.lucene.analysis.en.EnglishAnalyzer
 class AnalyzerResourceSuite extends CatsEffectSuite {
 
   test("tokenizer should work") {
-    val analyzer = AnalyzerResource.tokenizer[IO](new EnglishAnalyzer())
-    val actual = analyzer.use { f =>
+    val analyzer = AnalyzerResource.fromAnalyzer[IO](new EnglishAnalyzer())
+    val tokenizer = Tokenizer.vectorTokenizer(analyzer)
+    val actual = tokenizer.use { f =>
       f("Hello my name is Neeko")
     }
     assertIO(actual, Vector("hello", "my", "name", "neeko"))
   }
 
   test("tokenizer should yield a func that can be used multiple times") {
-    val analyzer = AnalyzerResource.tokenizer[IO](new EnglishAnalyzer())
-    val actual = analyzer.use { f =>
+    val analyzer = AnalyzerResource.fromAnalyzer[IO](new EnglishAnalyzer())
+    val tokenizer = Tokenizer.vectorTokenizer(analyzer)
+    val actual = tokenizer.use { f =>
       for {
         v1 <- f("Hello my name is Neeko")
         v2 <- f("I enjoy jumping on counters")
@@ -50,14 +52,15 @@ class AnalyzerResourceSuite extends CatsEffectSuite {
     import org.apache.lucene.analysis.LowerCaseFilter
     import org.apache.lucene.analysis.Analyzer
 
-    val stemmer = AnalyzerResource.tokenizer[IO](new Analyzer {
+    val analyzer = AnalyzerResource.fromAnalyzer[IO](new Analyzer {
       protected def createComponents(fieldName: String): TokenStreamComponents = {
         val source = new StandardTokenizer()
         val tokens = new LowerCaseFilter(source)
         new TokenStreamComponents(source, new PorterStemFilter(tokens))
       }
     })
-    val actual = stemmer.use { f =>
+    val tokenizer = Tokenizer.vectorTokenizer(analyzer)
+    val actual = tokenizer.use { f =>
       for {
         v1 <- f("Hello my name is Neeko")
         v2 <- f("I enjoy jumping on counters")
