[SPARKNLP-855] Introducing AlbertForZeroShotClassification (#14361)

* [SPARKNLP-855] Introducing AlbertForZeroShotClassification * [SPARKNLP-855] Adding notebook examples for AlbertZeroShotClassification --------- Co-authored-by: Maziyar Panahi <maziyar.panahi@iscpif.fr>
JohnSnowLabs · Sep 1, 2024 · 9d94b9a · 9d94b9a
1 parent f47ee50
commit 9d94b9a
Show file tree

Hide file tree

Showing 11 changed files with 7,899 additions and 4 deletions.
diff --git a/examples/python/transformers/HuggingFace_in_Spark_NLP_AlbertForZeroShotClassification.ipynb b/examples/python/transformers/HuggingFace_in_Spark_NLP_AlbertForZeroShotClassification.ipynb
diff --git a/...hon/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_AlbertForZeroShotClassification.ipynb b/...hon/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_AlbertForZeroShotClassification.ipynb
diff --git a/python/sparknlp/annotator/classifier_dl/__init__.py b/python/sparknlp/annotator/classifier_dl/__init__.py
@@ -51,4 +51,5 @@
 from sparknlp.annotator.classifier_dl.deberta_for_zero_shot_classification import *
 from sparknlp.annotator.classifier_dl.mpnet_for_sequence_classification import *
 from sparknlp.annotator.classifier_dl.mpnet_for_question_answering import *
-from sparknlp.annotator.classifier_dl.mpnet_for_token_classification import *
+from sparknlp.annotator.classifier_dl.mpnet_for_token_classification import *
+from sparknlp.annotator.classifier_dl.albert_for_zero_shot_classification import *
diff --git a/python/sparknlp/annotator/classifier_dl/albert_for_zero_shot_classification.py b/python/sparknlp/annotator/classifier_dl/albert_for_zero_shot_classification.py
@@ -0,0 +1,211 @@
+#  Copyright 2017-2024 John Snow Labs
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+
+"""Contains classes for AlbertForZeroShotClassification."""
+
+from sparknlp.common import *
+
+
+class AlbertForZeroShotClassification(AnnotatorModel,
+                                      HasCaseSensitiveProperties,
+                                      HasBatchedAnnotate,
+                                      HasClassifierActivationProperties,
+                                      HasCandidateLabelsProperties,
+                                      HasEngine,
+                                      HasMaxSentenceLengthLimit):
+    """AlbertForZeroShotClassification using a `ModelForSequenceClassification` trained on NLI (natural language
+    inference) tasks. Equivalent of `AlbertForSequenceClassification` models, but these models don't require a hardcoded
+    number of potential classes, they can be chosen at runtime. It usually means it's slower but it is much more
+    flexible.
+
+    Note that the model will loop through all provided labels. So the more labels you have, the
+    longer this process will take.
+
+    Any combination of sequences and labels can be passed and each combination will be posed as a premise/hypothesis
+    pair and passed to the pretrained model.
+
+    Pretrained models can be loaded with :meth:`.pretrained` of the companion
+    object:
+
+    >>> sequenceClassifier = AlbertForZeroShotClassification.pretrained() \\
+    ...     .setInputCols(["token", "document"]) \\
+    ...     .setOutputCol("label")
+
+    The default model is ``"albert_zero_shot_classifier_onnx"``, if no name is
+    provided.
+
+    For available pretrained models please see the `Models Hub
+    <https://sparknlp.org/models?task=Text+Classification>`__.
+
+    To see which models are compatible and how to import them see
+    `Import Transformers into Spark NLP 🚀
+    <https://github.com/JohnSnowLabs/spark-nlp/discussions/5669>`_.
+
+    ====================== ======================
+    Input Annotation types Output Annotation type
+    ====================== ======================
+    ``DOCUMENT, TOKEN``    ``CATEGORY``
+    ====================== ======================
+
+    Parameters
+    ----------
+    batchSize
+        Batch size. Larger values allow faster processing but require more
+        memory, by default 8
+    caseSensitive
+        Whether to ignore case in tokens for embeddings matching, by default
+        True
+    configProtoBytes
+        ConfigProto from tensorflow, serialized into byte array.
+    maxSentenceLength
+        Max sentence length to process, by default 128
+    coalesceSentences
+        Instead of 1 class per sentence (if inputCols is `sentence`) output 1
+        class per document by averaging probabilities in all sentences, by
+        default False
+    activation
+        Whether to calculate logits via Softmax or Sigmoid, by default
+        `"softmax"`.
+
+    Examples
+    --------
+    >>> import sparknlp
+    >>> from sparknlp.base import *
+    >>> from sparknlp.annotator import *
+    >>> from pyspark.ml import Pipeline
+    >>> documentAssembler = DocumentAssembler() \\
+    ...     .setInputCol("text") \\
+    ...     .setOutputCol("document")
+    >>> tokenizer = Tokenizer() \\
+    ...     .setInputCols(["document"]) \\
+    ...     .setOutputCol("token")
+    >>> sequenceClassifier = AlbertForZeroShotClassification.pretrained() \\
+    ...     .setInputCols(["token", "document"]) \\
+    ...     .setOutputCol("label") \\
+    ...     .setCaseSensitive(True)
+    >>> pipeline = Pipeline().setStages([
+    ...     documentAssembler,
+    ...     tokenizer,
+    ...     sequenceClassifier
+    ... ])
+    >>> data = spark.createDataFrame([["I have a problem with my iphone that needs to be resolved asap!!"]]).toDF("text")
+    >>> result = pipeline.fit(data).transform(data)
+    >>> result.select("label.result").show(truncate=False)
+    +---------+
+    |result   |
+    +---------+
+    |[urgent] |
+    +---------+
+    """
+    name = "AlbertForZeroShotClassification"
+
+    inputAnnotatorTypes = [AnnotatorType.DOCUMENT, AnnotatorType.TOKEN]
+
+    outputAnnotatorType = AnnotatorType.CATEGORY
+
+    configProtoBytes = Param(Params._dummy(),
+                             "configProtoBytes",
+                             "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()",
+                             TypeConverters.toListInt)
+
+    coalesceSentences = Param(Params._dummy(), "coalesceSentences",
+                              "Instead of 1 class per sentence (if inputCols is '''sentence''') output 1 class per document by averaging probabilities in all sentences.",
+                              TypeConverters.toBoolean)
+
+    def getClasses(self):
+        """
+        Returns labels used to train this model
+        """
+        return self._call_java("getClasses")
+
+    def setConfigProtoBytes(self, b):
+        """Sets configProto from tensorflow, serialized into byte array.
+
+        Parameters
+        ----------
+        b : List[int]
+            ConfigProto from tensorflow, serialized into byte array
+        """
+        return self._set(configProtoBytes=b)
+
+    def setCoalesceSentences(self, value):
+        """Instead of 1 class per sentence (if inputCols is '''sentence''') output 1 class per document by averaging
+        probabilities in all sentences. Due to max sequence length limit in almost all transformer models such as ALBERT
+        (512 tokens), this parameter helps to feed all the sentences into the model and average all the probabilities
+        for the entire document instead of probabilities per sentence. (Default: False)
+
+        Parameters
+        ----------
+        value : bool
+            If the output of all sentences will be averaged to one output
+        """
+        return self._set(coalesceSentences=value)
+
+    @keyword_only
+    def __init__(self, classname="com.johnsnowlabs.nlp.annotators.classifier.dl.AlbertForZeroShotClassification",
+                 java_model=None):
+        super(AlbertForZeroShotClassification, self).__init__(
+            classname=classname,
+            java_model=java_model
+        )
+        self._setDefault(
+            batchSize=8,
+            maxSentenceLength=128,
+            caseSensitive=True,
+            coalesceSentences=False,
+            activation="softmax"
+        )
+
+    @staticmethod
+    def loadSavedModel(folder, spark_session):
+        """Loads a locally saved model.
+
+        Parameters
+        ----------
+        folder : str
+            Folder of the saved model
+        spark_session : pyspark.sql.SparkSession
+            The current SparkSession
+
+        Returns
+        -------
+        AlbertForZeroShotClassification
+            The restored model
+        """
+        from sparknlp.internal import _AlbertForZeroShotClassificationLoader
+        jModel = _AlbertForZeroShotClassificationLoader(folder, spark_session._jsparkSession)._java_obj
+        return AlbertForZeroShotClassification(java_model=jModel)
+
+    @staticmethod
+    def pretrained(name="albert_zero_shot_classifier_onnx", lang="en", remote_loc=None):
+        """Downloads and loads a pretrained model.
+
+        Parameters
+        ----------
+        name : str, optional
+            Name of the pretrained model, by default
+            "albert_zero_shot_classifier_onnx"
+        lang : str, optional
+            Language of the pretrained model, by default "en"
+        remote_loc : str, optional
+            Optional remote address of the resource, by default None. Will use
+            Spark NLPs repositories otherwise.
+
+        Returns
+        -------
+        AlbertForZeroShotClassification
+            The restored model
+        """
+        from sparknlp.pretrained import ResourceDownloader
+        return ResourceDownloader.downloadModel(AlbertForZeroShotClassification, name, lang, remote_loc)
diff --git a/python/sparknlp/internal/__init__.py b/python/sparknlp/internal/__init__.py
@@ -58,6 +58,15 @@ def __init__(self, path, jspark):
         )
 
 
+class _AlbertForZeroShotClassificationLoader(ExtendedJavaWrapper):
+    def __init__(self, path, jspark):
+        super(_AlbertForZeroShotClassificationLoader, self).__init__(
+            "com.johnsnowlabs.nlp.annotators.classifier.dl.AlbertForZeroShotClassification.loadSavedModel",
+            path,
+            jspark,
+        )
+
+
 class _BertLoader(ExtendedJavaWrapper):
     def __init__(self, path, jspark, use_openvino=False):
         super(_BertLoader, self).__init__(

diff --git a/python/test/annotator/classifier_dl/albert_for_zero_shot_classification_test.py b/python/test/annotator/classifier_dl/albert_for_zero_shot_classification_test.py
@@ -0,0 +1,60 @@
+#  Copyright 2017-2024 John Snow Labs
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+
+import unittest
+
+import pytest
+
+from sparknlp.annotator import *
+from sparknlp.base import *
+from test.annotator.common.has_max_sentence_length_test import HasMaxSentenceLengthTests
+from test.util import SparkContextForTest
+
+
+@pytest.mark.slow
+class AlbertForZeroShotClassificationTestSpec(unittest.TestCase, HasMaxSentenceLengthTests):
+    def setUp(self):
+        self.text = "I have a problem with my iphone that needs to be resolved asap!!"
+        self.data = SparkContextForTest.spark \
+            .createDataFrame([[self.text]]).toDF("text")
+        self.candidate_labels = ["urgent", "mobile", "technology"]
+
+        self.tested_annotator = AlbertForZeroShotClassification \
+            .pretrained()\
+            .setInputCols(["document", "token"]) \
+            .setOutputCol("multi_class") \
+            .setCandidateLabels(self.candidate_labels)
+
+    def test_run(self):
+        document_assembler = DocumentAssembler() \
+            .setInputCol("text") \
+            .setOutputCol("document")
+
+        tokenizer = Tokenizer().setInputCols("document").setOutputCol("token")
+
+        doc_classifier = self.tested_annotator
+
+        pipeline = Pipeline(stages=[
+            document_assembler,
+            tokenizer,
+            doc_classifier
+        ])
+
+        model = pipeline.fit(self.data)
+        model.transform(self.data).show()
+
+        light_pipeline = LightPipeline(model)
+        annotations_result = light_pipeline.fullAnnotate(self.text)
+        multi_class_result = annotations_result[0]["multi_class"][0].result
+        self.assertIn(multi_class_result, self.candidate_labels)
diff --git a/src/main/scala/com/johnsnowlabs/ml/ai/AlbertClassification.scala b/src/main/scala/com/johnsnowlabs/ml/ai/AlbertClassification.scala
@@ -25,10 +25,12 @@ import com.johnsnowlabs.ml.tensorflow.sign.{ModelSignatureConstants, ModelSignat
 import com.johnsnowlabs.ml.tensorflow.{TensorResources, TensorflowWrapper}
 import com.johnsnowlabs.ml.util.{ONNX, Openvino, TensorFlow}
 import com.johnsnowlabs.nlp.annotators.common._
+import com.johnsnowlabs.nlp.annotators.tokenizer.wordpiece.BasicTokenizer
 import com.johnsnowlabs.nlp.{ActivationFunction, Annotation}
 import org.intel.openvino.Tensor
 import org.tensorflow.ndarray.buffer.IntDataBuffer
 import org.slf4j.{Logger, LoggerFactory}
+import org.tensorflow.ndarray.buffer.IntDataBuffer
 
 import scala.collection.JavaConverters._
 
@@ -95,7 +97,19 @@ private[johnsnowlabs] class AlbertClassification(
   def tokenizeSeqString(
       candidateLabels: Seq[String],
       maxSeqLength: Int,
-      caseSensitive: Boolean): Seq[WordpieceTokenizedSentence] = ???
+      caseSensitive: Boolean): Seq[WordpieceTokenizedSentence] = {
+    val basicTokenizer = new BasicTokenizer(caseSensitive)
+    val encoder =
+      new SentencepieceEncoder(spp, caseSensitive, sentencePieceDelimiterId, pieceIdOffset = 1)
+
+    val labelsToSentences = candidateLabels.map { s => Sentence(s, 0, s.length - 1, 0) }
+
+    labelsToSentences.map(label => {
+      val tokens = basicTokenizer.tokenize(label)
+      val wordpieceTokens = tokens.flatMap(token => encoder.encode(token)).take(maxSeqLength)
+      WordpieceTokenizedSentence(wordpieceTokens)
+    })
+  }
 
   def tokenizeDocument(
       docs: Seq[Annotation],
@@ -310,7 +324,30 @@ private[johnsnowlabs] class AlbertClassification(
       batch: Seq[Array[Int]],
       entailmentId: Int,
       contradictionId: Int,
-      activation: String): Array[Array[Float]] = ???
+      activation: String): Array[Array[Float]] = {
+
+    val maxSentenceLength = batch.map(encodedSentence => encodedSentence.length).max
+    val paddedBatch = batch.map(arr => padArrayWithZeros(arr, maxSentenceLength))
+    val batchLength = paddedBatch.length
+
+    val rawScores = detectedEngine match {
+      case TensorFlow.name => getRawScoresWithTF(paddedBatch, maxSentenceLength)
+      case ONNX.name => getRawScoresWithOnnx(paddedBatch, maxSentenceLength, sequence = true)
+    }
+
+    val dim = rawScores.length / batchLength
+    rawScores
+      .grouped(dim)
+      .toArray
+  }
+
+  private def padArrayWithZeros(arr: Array[Int], maxLength: Int): Array[Int] = {
+    if (arr.length >= maxLength) {
+      arr
+    } else {
+      arr ++ Array.fill(maxLength - arr.length)(0)
+    }
+  }
 
   def tagSpan(batch: Seq[Array[Int]]): (Array[Array[Float]], Array[Array[Float]]) = {
     val batchLength = batch.length

diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotator.scala b/src/main/scala/com/johnsnowlabs/nlp/annotator.scala
@@ -432,6 +432,13 @@ package object annotator {
       extends ReadablePretrainedAlbertForTokenModel
       with ReadAlbertForTokenDLModel
 
+  type AlbertForZeroShotClassification =
+    com.johnsnowlabs.nlp.annotators.classifier.dl.AlbertForZeroShotClassification
+
+  object AlbertForZeroShotClassification
+      extends ReadablePretrainedAlbertForZeroShotModel
+      with ReadAlbertForZeroShotDLModel
+
   type XlnetForTokenClassification =
     com.johnsnowlabs.nlp.annotators.classifier.dl.XlnetForTokenClassification