Merge pull request #84 from lambdaofgod/tokenizer-normalizer-params
Make lowercasing by Normalizer optional
saif-ellafi authored Jan 19, 2018
2 parents f5004b9 + fa98d36 commit f425d84
Showing 7 changed files with 70 additions and 5 deletions.
1 change: 1 addition & 0 deletions docs/components.html
@@ -243,6 +243,7 @@ <h4 id="Normalizer" class="section-block">3. Normalizer: Text cleaning</h4>
<ul>
<li>
setPattern(pattern): Regular expression for normalization, defaults to [^A-Za-z]
setLowercase(value): whether to lowercase tokens, defaults to true
</li>
</ul>
<b>Example:</b><br>
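For context, here is a minimal usage sketch of the new option (illustrative only; the column names are assumptions, not part of this diff):

```scala
import com.johnsnowlabs.nlp.annotators.Normalizer

// Strip characters matching the pattern but keep the original casing.
val normalizer = new Normalizer()
  .setInputCols(Array("token"))  // assumed tokenizer output column
  .setOutputCol("normalized")
  .setPattern("[^a-zA-Z]")       // the existing default
  .setLowercase(false)           // new in this PR; defaults to true
```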
6 changes: 6 additions & 0 deletions python/sparknlp/annotator.py
@@ -115,6 +115,10 @@ class Normalizer(AnnotatorTransformer):
"normalization regex pattern which match will be replaced with a space",
typeConverter=TypeConverters.toString)

lowercase = Param(Params._dummy(),
"lowercase",
"whether to convert strings to lowercase")

@keyword_only
def __init__(self):
super(Normalizer, self).__init__()
@@ -123,6 +127,8 @@ def __init__(self):
def setPattern(self, value):
return self._set(pattern=value)

def setLowercase(self, value):
return self._set(lowercase=value)

class RegexMatcher(AnnotatorTransformer):

20 changes: 20 additions & 0 deletions python/test/annotators.py
@@ -76,6 +76,26 @@ def runTest(self):
lemmatizer.transform(tokenized).show()


class NormalizerTestSpec(unittest.TestCase):

def setUp(self):
self.data = SparkContextForTest.data

def runTest(self):
document_assembler = DocumentAssembler() \
.setInputCol("text") \
.setOutputCol("document")
tokenizer = RegexTokenizer() \
.setOutputCol("token")
normalizer = Normalizer() \
.setInputCols(["token"]) \
.setOutputCol("normalized_token") \
.setLowercase(False)
assembled = document_assembler.transform(self.data)
tokenized = tokenizer.transform(assembled)
normalizer.transform(tokenized).show()


class DateMatcherTestSpec(unittest.TestCase):

def setUp(self):
15 changes: 12 additions & 3 deletions src/main/scala/com/johnsnowlabs/nlp/annotators/Normalizer.scala
@@ -1,7 +1,7 @@
package com.johnsnowlabs.nlp.annotators

import com.johnsnowlabs.nlp.{Annotation, AnnotatorModel}
-import org.apache.spark.ml.param.Param
+import org.apache.spark.ml.param.{BooleanParam, Param}
import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable}

/**
@@ -17,20 +17,29 @@ class Normalizer(override val uid: String) extends AnnotatorModel[Normalizer] {
override val requiredAnnotatorTypes: Array[AnnotatorType] = Array(TOKEN)

val pattern = new Param[String](this, "pattern", "normalization regex pattern whose matches will be replaced with a space")
val lowercase = new BooleanParam(this, "lowercase", "whether to convert strings to lowercase")

setDefault(pattern, "[^a-zA-Z]")
setDefault(lowercase, true)

def getPattern: String = $(pattern)

def setPattern(value: String): this.type = set(pattern, value)

def getLowercase: Boolean = $(lowercase)

def setLowercase(value: Boolean): this.type = set(lowercase, value)

def this() = this(Identifiable.randomUID("NORMALIZER"))

/** ToDo: Review implementation; the current implementation generates spaces between non-words, potentially breaking tokens */
override def annotate(annotations: Seq[Annotation]): Seq[Annotation] =
annotations.map { token =>
-val nToken = token.result
-  .toLowerCase
+val cased =
+  if ($(lowercase)) token.result.toLowerCase
+  else token.result
+
+val nToken = cased
.replaceAll($(pattern), "")
.trim
Annotation(
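For intuition, the change above amounts to the following standalone sketch of the per-token transformation (a hypothetical mirror of annotate(), not the annotator API itself):

```scala
// Hypothetical standalone mirror of the annotate() logic above.
def normalizeToken(token: String,
                   pattern: String = "[^a-zA-Z]",
                   lowercase: Boolean = true): String = {
  val cased = if (lowercase) token.toLowerCase else token
  cased.replaceAll(pattern, "").trim
}

normalizeToken("Spark-NLP!")                     // "sparknlp" (previous hard-coded behavior)
normalizeToken("Spark-NLP!", lowercase = false)  // "SparkNLP" (new opt-out)
```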
8 changes: 8 additions & 0 deletions src/test/scala/com/johnsnowlabs/nlp/AnnotatorBuilder.scala
@@ -46,6 +46,14 @@ object AnnotatorBuilder extends FlatSpec { this: Suite =>
normalizer.transform(withTokenizer(dataset))
}

def withCaseSensitiveNormalizer(dataset: Dataset[Row]): Dataset[Row] = {
val normalizer = new Normalizer()
.setInputCols(Array("token"))
.setOutputCol("normalized")
.setLowercase(false)
normalizer.transform(withTokenizer(dataset))
}

def withFullLemmatizer(dataset: Dataset[Row]): Dataset[Row] = {
val lemmatizer = new Lemmatizer()
.setInputCols(Array("token"))
src/test/scala/com/johnsnowlabs/nlp/annotators/NormalizerBehaviors.scala
@@ -11,7 +11,7 @@ trait NormalizerBehaviors { this: FlatSpec =>
AnnotatorBuilder.withFullNormalizer(dataset)
.collect().foreach {
row =>
-row.getSeq[Row](3)
+row.getSeq[Row](4)
.map(Annotation(_))
.foreach {
case stem: Annotation if stem.annotatorType == AnnotatorType.TOKEN =>
@@ -21,4 +21,25 @@ trait NormalizerBehaviors { this: FlatSpec =>
}
}
}

def lowercasingNormalizerPipeline(dataset: => Dataset[Row]): Unit = {
"A case-sensitive Normalizer Annotator" should "successfully transform data" in {
AnnotatorBuilder.withCaseSensitiveNormalizer(dataset)
.collect().foreach {
row =>
val tokens = row.getSeq[Row](3).map(Annotation(_))
val normalizedAnnotations = row.getSeq[Row](4).map(Annotation(_))
normalizedAnnotations.foreach {
case stem: Annotation if stem.annotatorType == AnnotatorType.TOKEN =>
assert(stem.result.nonEmpty, "Annotation result exists")
case _ =>
}

normalizedAnnotations.zip(tokens).foreach {
case (stem: Annotation, token: Annotation) =>
assert(stem.result == token.result.replaceAll("[^a-zA-Z]", ""))
}
}
}
}
}
src/test/scala/com/johnsnowlabs/nlp/annotators/NormalizerTestSpec.scala
@@ -17,5 +17,5 @@ class NormalizerTestSpec extends FlatSpec with NormalizerBehaviors {
val latinBodyData: Dataset[Row] = DataBuilder.basicDataBuild(ContentProvider.latinBody)

"A full Normalizer pipeline with latin content" should behave like fullNormalizerPipeline(latinBodyData)

"A Normalizer pipeline with latin content and disabled lowercasing" should behave like lowercasingNormalizerPipeline(latinBodyData)
}
