Merge pull request #2 from valencik/common-analyzers

Common analyzers
cozydev-pink · Jun 21, 2022 · 221c286 · 221c286
2 parents 24aaba9 + bfc49f8
commit 221c286
Show file tree

Hide file tree

Showing 3 changed files with 84 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -23,3 +23,6 @@ metals.sbt
 
 # npm
 node_modules/
+
+# macOS
+.DS_Store
diff --git a/lucene/src/main/scala/textmogrify/lucene/Analyzers.scala b/lucene/src/main/scala/textmogrify/lucene/Analyzers.scala
@@ -0,0 +1,48 @@
+/*
+ * Copyright 2022 Pig.io
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package textmogrify.lucene
+
+import org.apache.lucene.analysis.Analyzer.TokenStreamComponents
+import org.apache.lucene.analysis.standard.StandardTokenizer
+import org.apache.lucene.analysis.en.PorterStemFilter
+import org.apache.lucene.analysis.LowerCaseFilter
+import org.apache.lucene.analysis.Analyzer
+import org.apache.lucene.analysis.en.EnglishAnalyzer
+import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter
+
+/** Factory methods that build ready-made Lucene `Analyzer` instances. */
+object Analyzers {
+  /** Creates a new `EnglishAnalyzer` via its no-argument constructor. */
+  def englishStandard(): Analyzer = new EnglishAnalyzer()
+  /** Analyzer chaining `StandardTokenizer` -> `LowerCaseFilter` -> `PorterStemFilter`. */
+  def porterStemmer(): Analyzer =
+    new Analyzer {
+      // `fieldName` is not consulted: the same chain is built for every field.
+      protected def createComponents(fieldName: String): TokenStreamComponents = {
+        val tokenizer = new StandardTokenizer()
+        val stemmed = new PorterStemFilter(new LowerCaseFilter(tokenizer))
+        new TokenStreamComponents(tokenizer, stemmed)
+      }
+    }
+  /** Analyzer chaining `StandardTokenizer` -> `ASCIIFoldingFilter`; no lowercasing step. */
+  def asciiFolder(): Analyzer =
+    new Analyzer {
+      protected def createComponents(fieldName: String): TokenStreamComponents = {
+        val tokenizer = new StandardTokenizer()
+        new TokenStreamComponents(tokenizer, new ASCIIFoldingFilter(tokenizer))
+      }
+    }
+}
diff --git a/lucene/src/test/scala/textmogrify/lucene/AnalyzersSuite.scala b/lucene/src/test/scala/textmogrify/lucene/AnalyzersSuite.scala
@@ -0,0 +1,33 @@
+/*
+ * Copyright 2022 Pig.io
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package textmogrify
+package lucene
+
+import munit.CatsEffectSuite
+import cats.effect._
+
+/** Checks analyzers built by `Analyzers` against their expected token output. */
+class AnalyzersSuite extends CatsEffectSuite {
+  test("asciiFolder should fold") {
+    // "ñ" is expected to come back as "n"; the capital "I" is expected unchanged.
+    val expected = Vector("I", "like", "jalapenos")
+    val folded = AnalyzerResource
+      .tokenizer[IO](Analyzers.asciiFolder())
+      .use(tokenize => tokenize("I like jalapeños"))
+    assertIO(folded, expected)
+  }
+}
-Original file line number
+Diff line change
@@ Expand Up / @@ -23,3 +23,6 @@ metals.sbt @@
     # npm
     node_modules/
+    # macOS
+    .DS_Store