From 4583ccf21658dca46248824740f7ae8df8ad63c9 Mon Sep 17 00:00:00 2001
From: Prabod Rathnayaka <prabod@rathnayaka.me>
Date: Mon, 10 Jun 2024 22:37:19 +1000
Subject: [PATCH] SparkNLP - 995 Introducing MistralAI LLMs (#14318)

* added mistral

* Mistral python API
---
 python/sparknlp/annotator/seq2seq/__init__.py |   1 +
 .../annotator/seq2seq/mistral_transformer.py  | 349 +++++++++++++
 python/sparknlp/internal/__init__.py          |   5 +
 .../seq2seq/mistral_transformer_test.py       |  46 ++
 .../com/johnsnowlabs/ml/ai/Mistral.scala      | 445 +++++++++++++++++
 .../seq2seq/MistralTransformer.scala          | 461 ++++++++++++++++++
 .../annotators/seq2seq/MistralTestSpec.scala  |  51 ++
 7 files changed, 1358 insertions(+)
 create mode 100644 python/sparknlp/annotator/seq2seq/mistral_transformer.py
 create mode 100644 python/test/annotator/seq2seq/mistral_transformer_test.py
 create mode 100644 src/main/scala/com/johnsnowlabs/ml/ai/Mistral.scala
 create mode 100644 src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/MistralTransformer.scala
 create mode 100644 src/test/scala/com/johnsnowlabs/nlp/annotators/seq2seq/MistralTestSpec.scala

diff --git a/python/sparknlp/annotator/seq2seq/__init__.py b/python/sparknlp/annotator/seq2seq/__init__.py
index 5abf7be0d12dfb..f55474504816ee 100644
--- a/python/sparknlp/annotator/seq2seq/__init__.py
+++ b/python/sparknlp/annotator/seq2seq/__init__.py
@@ -19,3 +19,4 @@
 from sparknlp.annotator.seq2seq.bart_transformer import *
 from sparknlp.annotator.seq2seq.llama2_transformer import *
 from sparknlp.annotator.seq2seq.m2m100_transformer import *
+from sparknlp.annotator.seq2seq.mistral_transformer import *
diff --git a/python/sparknlp/annotator/seq2seq/mistral_transformer.py b/python/sparknlp/annotator/seq2seq/mistral_transformer.py
new file mode 100644
index 00000000000000..29eff367e5b52f
--- /dev/null
+++ b/python/sparknlp/annotator/seq2seq/mistral_transformer.py
@@ -0,0 +1,349 @@
+#  Copyright 2017-2022 John Snow Labs
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+"""Contains classes for the MistralTransformer."""
+
+from sparknlp.common import *
+
+
+class MistralTransformer(AnnotatorModel, HasBatchedAnnotate, HasEngine):
+    """Mistral 7B
+
+    Mistral 7B, a 7.3 billion-parameter model that stands out for its efficient and effective
+    performance in natural language processing. Surpassing Llama 2 13B across all benchmarks and
+    excelling over Llama 1 34B in various aspects, Mistral 7B strikes a balance between English
+    language tasks and code comprehension, rivaling the capabilities of CodeLlama 7B in the
+    latter.
+
+    Mistral 7B introduces Grouped-query attention (GQA) for quicker inference, enhancing
+    processing speed without compromising accuracy. This streamlined approach ensures a smoother
+    user experience, making Mistral 7B a practical choice for real-world applications.
+
+    Additionally, Mistral 7B adopts Sliding Window Attention (SWA) to efficiently handle longer
+    sequences at a reduced computational cost. This feature enhances the model's ability to
+    process extensive textual input, expanding its utility in handling more complex tasks.
+
+    In summary, Mistral 7B represents a notable advancement in language models, offering a
+    reliable and versatile solution for various natural language processing challenges.
+
+    Pretrained models can be loaded with :meth:`.pretrained` of the companion
+    object:
+
+    >>> mistral = MistralTransformer.pretrained() \\
+    ...     .setInputCols(["document"]) \\
+    ...     .setOutputCol("generation")
+
+
+    The default model is ``"mistral-7b"``, if no name is provided. For available
+    pretrained models please see the `Models Hub
+    <https://sparknlp.org/models?q=mistral>`__.
+
+    ====================== ======================
+    Input Annotation types Output Annotation type
+    ====================== ======================
+    ``DOCUMENT``           ``DOCUMENT``
+    ====================== ======================
+
+    Parameters
+    ----------
+    configProtoBytes
+        ConfigProto from tensorflow, serialized into byte array.
+    minOutputLength
+        Minimum length of the sequence to be generated, by default 0
+    maxOutputLength
+        Maximum length of output text, by default 20
+    doSample
+        Whether or not to use sampling; use greedy decoding otherwise, by default False
+    temperature
+        The value used to module the next token probabilities, by default 1.0
+    topK
+        The number of highest probability vocabulary tokens to keep for
+        top-k-filtering, by default 50
+    topP
+        Top cumulative probability for vocabulary tokens, by default 1.0
+
+        If set to float < 1, only the most probable tokens with probabilities
+        that add up to ``topP`` or higher are kept for generation.
+    repetitionPenalty
+        The parameter for repetition penalty, 1.0 means no penalty. , by default
+        1.0
+    noRepeatNgramSize
+        If set to int > 0, all ngrams of that size can only occur once, by
+        default 0
+    ignoreTokenIds
+        A list of token ids which are ignored in the decoder's output, by
+        default []
+
+    Notes
+    -----
+    This is a very computationally expensive module especially on larger
+    sequence. The use of an accelerator such as GPU is recommended.
+
+    References
+    ----------
+    - `Mistral 7B
+      <https://mistral.ai/news/announcing-mistral-7b/>`__
+    - https://github.com/mistralai/mistral-src
+
+    **Paper Abstract:**
+
+    *We introduce Mistral 7B v0.1, a 7-billion-parameter language model engineered for superior
+    performance and efficiency. Mistral 7B outperforms Llama 2 13B across all evaluated
+    benchmarks, and Llama 1 34B in reasoning, mathematics, and code generation. Our model
+    leverages grouped-query attention (GQA) for faster inference, coupled with sliding window
+    attention (SWA) to effectively handle sequences of arbitrary length with a reduced inference
+    cost. We also provide a model fine-tuned to follow instructions, Mistral 7B -- Instruct, that
+    surpasses the Llama 2 13B -- Chat model both on human and automated benchmarks. Our models are
+    released under the Apache 2.0 license.*
+
+    Examples
+    --------
+    >>> import sparknlp
+    >>> from sparknlp.base import *
+    >>> from sparknlp.annotator import *
+    >>> from pyspark.ml import Pipeline
+    >>> documentAssembler = DocumentAssembler() \\
+    ...     .setInputCol("text") \\
+    ...     .setOutputCol("documents")
+    >>> mistral = MistralTransformer.pretrained("mistral-7b") \\
+    ...     .setInputCols(["documents"]) \\
+    ...     .setMaxOutputLength(50) \\
+    ...     .setOutputCol("generation")
+    >>> pipeline = Pipeline().setStages([documentAssembler, mistral])
+    >>> data = spark.createDataFrame([["My name is Leonardo."]]).toDF("text")
+    >>> result = pipeline.fit(data).transform(data)
+    >>> result.select("summaries.generation").show(truncate=False)
+    +----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+    |result                                                                                                                                                                                              |
+    +----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+    |[Leonardo Da Vinci invented the microscope?\n Question: Leonardo Da Vinci invented the microscope?\n Answer: No, Leonardo Da Vinci did not invent the microscope. The first microscope was invented |
+    | in the late 16th century, long after Leonardo']                                                                                                                                                    |
+    -----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+    """
+
+    name = "MistralTransformer"
+
+    inputAnnotatorTypes = [AnnotatorType.DOCUMENT]
+
+    outputAnnotatorType = AnnotatorType.DOCUMENT
+
+
+    configProtoBytes = Param(Params._dummy(),
+                             "configProtoBytes",
+                             "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()",
+                             TypeConverters.toListInt)
+
+    minOutputLength = Param(Params._dummy(), "minOutputLength", "Minimum length of the sequence to be generated",
+                            typeConverter=TypeConverters.toInt)
+
+    maxOutputLength = Param(Params._dummy(), "maxOutputLength", "Maximum length of output text",
+                            typeConverter=TypeConverters.toInt)
+
+    doSample = Param(Params._dummy(), "doSample", "Whether or not to use sampling; use greedy decoding otherwise",
+                     typeConverter=TypeConverters.toBoolean)
+
+    temperature = Param(Params._dummy(), "temperature", "The value used to module the next token probabilities",
+                        typeConverter=TypeConverters.toFloat)
+
+    topK = Param(Params._dummy(), "topK",
+                 "The number of highest probability vocabulary tokens to keep for top-k-filtering",
+                 typeConverter=TypeConverters.toInt)
+
+    topP = Param(Params._dummy(), "topP",
+                 "If set to float < 1, only the most probable tokens with probabilities that add up to ``top_p`` or higher are kept for generation",
+                 typeConverter=TypeConverters.toFloat)
+
+    repetitionPenalty = Param(Params._dummy(), "repetitionPenalty",
+                              "The parameter for repetition penalty. 1.0 means no penalty. See `this paper <https://arxiv.org/pdf/1909.05858.pdf>`__ for more details",
+                              typeConverter=TypeConverters.toFloat)
+
+    noRepeatNgramSize = Param(Params._dummy(), "noRepeatNgramSize",
+                              "If set to int > 0, all ngrams of that size can only occur once",
+                              typeConverter=TypeConverters.toInt)
+
+    ignoreTokenIds = Param(Params._dummy(), "ignoreTokenIds",
+                           "A list of token ids which are ignored in the decoder's output",
+                           typeConverter=TypeConverters.toListInt)
+
+
+    def setIgnoreTokenIds(self, value):
+        """A list of token ids which are ignored in the decoder's output.
+
+        Parameters
+        ----------
+        value : List[int]
+            The words to be filtered out
+        """
+        return self._set(ignoreTokenIds=value)
+
+    def setConfigProtoBytes(self, b):
+        """Sets configProto from tensorflow, serialized into byte array.
+
+        Parameters
+        ----------
+        b : List[int]
+            ConfigProto from tensorflow, serialized into byte array
+        """
+        return self._set(configProtoBytes=b)
+
+    def setMinOutputLength(self, value):
+        """Sets minimum length of the sequence to be generated.
+
+        Parameters
+        ----------
+        value : int
+            Minimum length of the sequence to be generated
+        """
+        return self._set(minOutputLength=value)
+
+    def setMaxOutputLength(self, value):
+        """Sets maximum length of output text.
+
+        Parameters
+        ----------
+        value : int
+            Maximum length of output text
+        """
+        return self._set(maxOutputLength=value)
+
+    def setDoSample(self, value):
+        """Sets whether or not to use sampling, use greedy decoding otherwise.
+
+        Parameters
+        ----------
+        value : bool
+            Whether or not to use sampling; use greedy decoding otherwise
+        """
+        return self._set(doSample=value)
+
+    def setTemperature(self, value):
+        """Sets the value used to module the next token probabilities.
+
+        Parameters
+        ----------
+        value : float
+            The value used to module the next token probabilities
+        """
+        return self._set(temperature=value)
+
+    def setTopK(self, value):
+        """Sets the number of highest probability vocabulary tokens to keep for
+        top-k-filtering.
+
+        Parameters
+        ----------
+        value : int
+            Number of highest probability vocabulary tokens to keep
+        """
+        return self._set(topK=value)
+
+    def setTopP(self, value):
+        """Sets the top cumulative probability for vocabulary tokens.
+
+        If set to float < 1, only the most probable tokens with probabilities
+        that add up to ``topP`` or higher are kept for generation.
+
+        Parameters
+        ----------
+        value : float
+            Cumulative probability for vocabulary tokens
+        """
+        return self._set(topP=value)
+
+    def setRepetitionPenalty(self, value):
+        """Sets the parameter for repetition penalty. 1.0 means no penalty.
+
+        Parameters
+        ----------
+        value : float
+            The repetition penalty
+
+        References
+        ----------
+        See `Ctrl: A Conditional Transformer Language Model For Controllable
+        Generation <https://arxiv.org/pdf/1909.05858.pdf>`__ for more details.
+        """
+        return self._set(repetitionPenalty=value)
+
+    def setNoRepeatNgramSize(self, value):
+        """Sets size of n-grams that can only occur once.
+
+        If set to int > 0, all ngrams of that size can only occur once.
+
+        Parameters
+        ----------
+        value : int
+            N-gram size can only occur once
+        """
+        return self._set(noRepeatNgramSize=value)
+
+    @keyword_only
+    def __init__(self, classname="com.johnsnowlabs.nlp.annotators.seq2seq.MistralTransformer", java_model=None):
+        super(MistralTransformer, self).__init__(
+            classname=classname,
+            java_model=java_model
+        )
+        self._setDefault(
+            minOutputLength=0,
+            maxOutputLength=20,
+            doSample=False,
+            temperature=1,
+            topK=50,
+            topP=1,
+            repetitionPenalty=1.0,
+            noRepeatNgramSize=0,
+            ignoreTokenIds=[],
+            batchSize=1
+        )
+
+    @staticmethod
+    def loadSavedModel(folder, spark_session, use_openvino=False):
+        """Loads a locally saved model.
+
+        Parameters
+        ----------
+        folder : str
+            Folder of the saved model
+        spark_session : pyspark.sql.SparkSession
+            The current SparkSession
+
+        Returns
+        -------
+        MistralTransformer
+            The restored model
+        """
+        from sparknlp.internal import _MistralLoader
+        jModel = _MistralLoader(folder, spark_session._jsparkSession, use_openvino)._java_obj
+        return MistralTransformer(java_model=jModel)
+
+    @staticmethod
+    def pretrained(name="mistral-7b", lang="en", remote_loc=None):
+        """Downloads and loads a pretrained model.
+
+        Parameters
+        ----------
+        name : str, optional
+            Name of the pretrained model, by default "mistral-7b"
+        lang : str, optional
+            Language of the pretrained model, by default "en"
+        remote_loc : str, optional
+            Optional remote address of the resource, by default None. Will use
+            Spark NLPs repositories otherwise.
+
+        Returns
+        -------
+        MistralTransformer
+            The restored model
+        """
+        from sparknlp.pretrained import ResourceDownloader
+        return ResourceDownloader.downloadModel(MistralTransformer, name, lang, remote_loc)
diff --git a/python/sparknlp/internal/__init__.py b/python/sparknlp/internal/__init__.py
index 9b919620a16faa..c76d830e682658 100644
--- a/python/sparknlp/internal/__init__.py
+++ b/python/sparknlp/internal/__init__.py
@@ -276,6 +276,11 @@ def __init__(self, path, jspark):
         )
 
 
+class _MistralLoader(ExtendedJavaWrapper):
+    def __init__(self, path, jspark, use_openvino=False):
+        super(_MistralLoader, self).__init__(
+            "com.johnsnowlabs.nlp.annotators.seq2seq.MistralTransformer.loadSavedModel", path, jspark, use_openvino)
+
 class _MarianLoader(ExtendedJavaWrapper):
     def __init__(self, path, jspark):
         super(_MarianLoader, self).__init__(
diff --git a/python/test/annotator/seq2seq/mistral_transformer_test.py b/python/test/annotator/seq2seq/mistral_transformer_test.py
new file mode 100644
index 00000000000000..0b344d51ebe388
--- /dev/null
+++ b/python/test/annotator/seq2seq/mistral_transformer_test.py
@@ -0,0 +1,46 @@
+#  Copyright 2017-2022 John Snow Labs
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+import unittest
+
+import pytest
+
+from sparknlp.annotator import *
+from sparknlp.base import *
+from test.util import SparkContextForTest
+
+
+@pytest.mark.slow
+class MistralTransformerTextGenerationTestSpec(unittest.TestCase):
+    def setUp(self):
+        self.spark = SparkContextForTest.spark
+
+    def runTest(self):
+        data = self.spark.createDataFrame([
+            [1, """Leonardo Da Vinci invented the microscope?""".strip().replace("\n", " ")]]).toDF("id", "text")
+
+        document_assembler = DocumentAssembler() \
+            .setInputCol("text") \
+            .setOutputCol("documents")
+
+        mistral = MistralTransformer \
+            .pretrained() \
+            .setMaxOutputLength(50) \
+            .setDoSample(False) \
+            .setInputCols(["documents"]) \
+            .setOutputCol("generation")
+
+        pipeline = Pipeline().setStages([document_assembler, mistral])
+        results = pipeline.fit(data).transform(data)
+
+        results.select("generation.result").show(truncate=False)
diff --git a/src/main/scala/com/johnsnowlabs/ml/ai/Mistral.scala b/src/main/scala/com/johnsnowlabs/ml/ai/Mistral.scala
new file mode 100644
index 00000000000000..58d074a90cba32
--- /dev/null
+++ b/src/main/scala/com/johnsnowlabs/ml/ai/Mistral.scala
@@ -0,0 +1,445 @@
+/*
+ * Copyright 2017 - 2023  John Snow Labs
+ *
+ *    Licensed under the Apache License, Version 2.0 (the "License");
+ *    you may not use this file except in compliance with the License.
+ *    You may obtain a copy of the License at
+ *
+ *        http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *    Unless required by applicable law or agreed to in writing, software
+ *    distributed under the License is distributed on an "AS IS" BASIS,
+ *    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *    See the License for the specific language governing permissions and
+ *    limitations under the License.
+ */
+
+package com.johnsnowlabs.ml.ai
+
+import ai.onnxruntime.{OnnxTensor, OrtEnvironment, OrtSession}
+import com.johnsnowlabs.ml.ai.util.Generation.{Generate, GenerationConfig}
+import com.johnsnowlabs.ml.onnx.OnnxSession
+import com.johnsnowlabs.ml.onnx.OnnxWrapper.DecoderWrappers
+import com.johnsnowlabs.ml.onnx.TensorResources.implicits._
+import com.johnsnowlabs.ml.openvino.OpenvinoWrapper
+import com.johnsnowlabs.ml.tensorflow.sentencepiece.SentencePieceWrapper
+import com.johnsnowlabs.ml.util.{ONNX, Openvino, TensorFlow}
+import com.johnsnowlabs.nlp.Annotation
+import com.johnsnowlabs.nlp.AnnotatorType.DOCUMENT
+import org.intel.openvino.InferRequest
+import org.tensorflow.{Session, Tensor}
+
+import scala.collection.JavaConverters._
+
+private[johnsnowlabs] class Mistral(
+    val onnxWrappers: Option[DecoderWrappers],
+    val openvinoWrapper: Option[OpenvinoWrapper],
+    val spp: SentencePieceWrapper,
+    generationConfig: GenerationConfig)
+    extends Serializable
+    with Generate {
+
+  private val onnxSessionOptions: Map[String, String] = new OnnxSession().getSessionOptions
+  val detectedEngine: String =
+    if (onnxWrappers.isDefined) ONNX.name
+    else if (openvinoWrapper.isDefined) Openvino.name
+    else ONNX.name
+
+  private val GenerationConfig(
+    bosTokenId: Int,
+    paddingTokenId: Int,
+    eosTokenId: Int,
+    vocabSize: Int,
+    beginSuppressTokens,
+    suppressTokenIds,
+    forcedDecoderIds) =
+    generationConfig
+
+  private val pieceSize = spp.getSppModel.getPieceSize
+
+  /** Decode a sequence of sentences
+    * @param sentences
+    *   Sequence of sentences
+    * @return
+    *   Sequence of decoded sentences
+    */
+  def decode(sentences: Array[Array[Int]]): Seq[String] = {
+    sentences.map { s =>
+      val filteredPieceIds = s.filter(x => x <= pieceSize)
+      spp.getSppModel.decodeIds(filteredPieceIds.map(_.toInt): _*)
+    }
+  }
+
+  /** Encode a sequence of sentences
+    * @param sentences
+    *   Sequence of sentences
+    * @return
+    *   Sequence of encoded sentences
+    */
+  def encode(sentences: Seq[Annotation]): Seq[Array[Int]] = {
+    sentences.map(s => {
+      val sentWithTask = s.result
+      spp.getSppModel.encodeAsIds(sentWithTask)
+    })
+  }
+
+  def tag(
+      batch: Seq[Array[Int]],
+      minOutputLength: Int,
+      maxOutputLength: Int,
+      doSample: Boolean,
+      temperature: Double,
+      topK: Int,
+      topP: Double,
+      repetitionPenalty: Double,
+      noRepeatNgramSize: Int,
+      randomSeed: Option[Long],
+      ignoreTokenIds: Array[Int] = Array(),
+      beamSize: Int,
+      maxInputLength: Int): Array[Array[Int]] = {
+    val ignoreTokenIdsInt = ignoreTokenIds
+    val expandedDecoderInputsVals = batch
+    val sequencesLength = expandedDecoderInputsVals.map(x => x.length).toArray
+    val maxSentenceLength = sequencesLength.max // - curLen
+
+    val numReturn_sequences = 1
+    // from config
+
+    var effectiveBatch_size = 1
+    var effectiveBatch_mult = 1
+
+    if (doSample) {
+      effectiveBatch_size = expandedDecoderInputsVals.length * numReturn_sequences
+      effectiveBatch_mult = numReturn_sequences
+    } else {
+      effectiveBatch_size = expandedDecoderInputsVals.length
+      effectiveBatch_mult = 1
+    }
+
+    // Run the prompt through the decoder and get the past
+//    val decoderOutputs =
+//      generateGreedyOnnx(
+//        expandedDecoderInputsVals.toArray,
+//        (encoderSession, env),
+//        maxOutputLength)
+
+    val (decoderEncoderStateTensors, encoderAttentionMaskTensors, session) =
+      detectedEngine match {
+        case ONNX.name =>
+          // dummy tensors for decoder encode state and attention mask
+          val (encoderSession, env) = onnxWrappers.get.decoder.getSession(onnxSessionOptions)
+          (
+            Right(OnnxTensor.createTensor(env, Array(0))),
+            Right(OnnxTensor.createTensor(env, Array(1))),
+            Right((env, encoderSession)))
+        case Openvino.name =>
+          // not needed
+          (null, null, null)
+      }
+    val ovInferRequest: Option[InferRequest] = detectedEngine match {
+      case ONNX.name => None
+      case Openvino.name => Some(openvinoWrapper.get.getCompiledModel().create_infer_request())
+    }
+    // output with beam search
+    val modelOutputs = generate(
+      batch,
+      decoderEncoderStateTensors,
+      encoderAttentionMaskTensors,
+      expandedDecoderInputsVals.toArray,
+      maxOutputLength + maxSentenceLength,
+      minOutputLength,
+      doSample,
+      beamSize,
+      1,
+      temperature,
+      topK,
+      topP,
+      repetitionPenalty,
+      noRepeatNgramSize,
+      this.vocabSize,
+      this.eosTokenId,
+      this.paddingTokenId,
+      randomSeed,
+      ignoreTokenIdsInt,
+      session,
+      applySoftmax = false,
+      ovInferRequest = ovInferRequest)
+
+//    decoderOutputs
+    modelOutputs
+  }
+
+  def predict(
+      sentences: Seq[Annotation],
+      batchSize: Int,
+      minOutputLength: Int,
+      maxOutputLength: Int,
+      doSample: Boolean,
+      temperature: Double,
+      topK: Int,
+      topP: Double,
+      repetitionPenalty: Double,
+      noRepeatNgramSize: Int,
+      randomSeed: Option[Long] = None,
+      ignoreTokenIds: Array[Int] = Array(),
+      beamSize: Int,
+      maxInputLength: Int): Seq[Annotation] = {
+
+    val batchDecoder = sentences.grouped(batchSize).toArray.flatMap { batch =>
+      val batchSP = encode(batch)
+      val spIds = tag(
+        batchSP,
+        minOutputLength,
+        maxOutputLength,
+        doSample,
+        temperature,
+        topK,
+        topP,
+        repetitionPenalty,
+        noRepeatNgramSize,
+        randomSeed,
+        ignoreTokenIds,
+        beamSize,
+        maxInputLength)
+
+      decode(spIds)
+
+    }
+
+    var sentBegin, nextSentEnd = 0
+    val annotations = batchDecoder.zip(sentences).map { case (content, sent) =>
+      nextSentEnd += content.length - 1
+      val annots = new Annotation(
+        annotatorType = DOCUMENT,
+        begin = sentBegin,
+        end = nextSentEnd,
+        result = content,
+        metadata = sent.metadata)
+      sentBegin += nextSentEnd + 1
+      annots
+    }
+    annotations
+  }
+
+  private def getDecoderOutputsWithPast(
+      inputIds: Array[Array[Int]],
+      decoderPast: Map[String, OnnxTensor],
+      onnxSession: (OrtSession, OrtEnvironment))
+      : (Array[Array[Float]], Map[String, OnnxTensor]) = {
+    val (session, env) = onnxSession
+
+    val lastTokens: Array[Array[Long]] =
+      inputIds.map { tokenIds =>
+        Array(tokenIds.last.toLong)
+      }
+
+    val lastTokensTensor: OnnxTensor =
+      OnnxTensor.createTensor(env, lastTokens)
+    val decoderAttentionMask: OnnxTensor =
+      OnnxTensor.createTensor(env, lastTokens.map(_.map(_ => 1L)))
+    val decoderWithPastInputs: java.util.Map[String, OnnxTensor] = (Map(
+      OnnxSignatures.decoderInputIDs -> lastTokensTensor,
+      OnnxSignatures.decoderAttentionMask -> decoderAttentionMask) ++ decoderPast).asJava
+    val sessionOutput = session.run(decoderWithPastInputs)
+    val logits = sessionOutput.getFloatArray(OnnxSignatures.decoderOutput)
+    val decoderPresent = sessionOutput.getOnnxTensors(OnnxSignatures.decoderPresent)
+    lastTokensTensor.close()
+    val batchLogits = logits.grouped(vocabSize).toArray
+    (batchLogits, decoderPresent)
+
+  }
+
+  override def getModelOutput(
+      encoderInputIds: Seq[Array[Int]],
+      decoderInputIds: Seq[Array[Int]],
+      decoderEncoderStateTensors: Either[Tensor, OnnxTensor],
+      encoderAttentionMaskTensors: Either[Tensor, OnnxTensor],
+      maxLength: Int,
+      session: Either[Session, (OrtEnvironment, OrtSession)],
+      ovInferRequest: Option[InferRequest]): Array[Array[Float]] = {
+
+    detectedEngine match {
+      case TensorFlow.name =>
+        // not implemented yet
+        Array()
+      case ONNX.name =>
+        val (env, decoderSession) = session.right.get
+        val decoderOutputs =
+          getDecoderOutputs(decoderInputIds.toArray, onnxSession = (decoderSession, env))
+        decoderOutputs
+      case Openvino.name =>
+        val decoderOutputs =
+          getDecoderOutputsOv(
+            encoderInputIds.toArray,
+            decoderInputIds.toArray,
+            ovInferRequest.get)
+        decoderOutputs
+    }
+
+  }
+
+  private def getDecoderOutputsOv(
+      encoderInputIds: Array[Array[Int]],
+      decoderInputIds: Array[Array[Int]],
+      inferRequest: InferRequest): (Array[Array[Float]]) = {
+    val (inputIdsLong, inputPositionIDsLong): (Array[Long], Array[Long]) =
+      if (encoderInputIds.head.length == decoderInputIds.head.length) {
+        // First pass
+        val inpIdsLong = decoderInputIds.flatMap { tokenIds => tokenIds.map(_.toLong) }
+        val posIdsLong = decoderInputIds.flatMap { tokenIds =>
+          tokenIds.zipWithIndex.map { case (_, i) =>
+            i.toLong
+          }
+        }
+        (inpIdsLong, posIdsLong)
+      } else {
+        // Subsequent passes
+        val inpIdsLong = decoderInputIds.map { tokenIds => tokenIds.last.toLong }
+        val posIdsLong = decoderInputIds.map { tokenIds =>
+          tokenIds.zipWithIndex.map { case (_, i) =>
+            i.toLong
+          }.last
+        }
+        (inpIdsLong, posIdsLong)
+      }
+    val attentionMask: Array[Long] =
+      decoderInputIds.flatMap { tokenIds => tokenIds.map(_ => 1L) }
+
+    val batchSize: Int = decoderInputIds.length
+    val beamIdx: Array[Int] = new Array[Int](batchSize)
+    val shape: Array[Int] = Array(batchSize, inputIdsLong.length / batchSize)
+
+    val inputIdsLongTensor: org.intel.openvino.Tensor =
+      new org.intel.openvino.Tensor(shape, inputIdsLong)
+    val decoderAttentionMask: org.intel.openvino.Tensor =
+      new org.intel.openvino.Tensor(Array(batchSize, decoderInputIds.head.length), attentionMask)
+    val decoderPositionIDs: org.intel.openvino.Tensor =
+      new org.intel.openvino.Tensor(shape, inputPositionIDsLong)
+    val beamIdxTensor: org.intel.openvino.Tensor =
+      new org.intel.openvino.Tensor(Array(batchSize), beamIdx)
+
+    inferRequest.set_tensor("input_ids", inputIdsLongTensor)
+    inferRequest.set_tensor("attention_mask", decoderAttentionMask)
+    inferRequest.set_tensor("position_ids", decoderPositionIDs)
+    inferRequest.set_tensor("beam_idx", beamIdxTensor)
+
+    inferRequest.infer()
+
+    val result = inferRequest.get_tensor("logits")
+    val logitsRaw = result.data()
+
+    val sequenceLength = inputIdsLong.length / batchSize
+    val decoderOutputs = (0 until batchSize).map(i => {
+      logitsRaw
+        .slice(
+          i * sequenceLength * vocabSize + (sequenceLength - 1) * vocabSize,
+          i * sequenceLength * vocabSize + sequenceLength * vocabSize)
+    })
+    decoderOutputs.toArray
+  }
+
+  private def getDecoderOutputs(
+      inputIds: Array[Array[Int]],
+      onnxSession: (OrtSession, OrtEnvironment)): (Array[Array[Float]]) = {
+    val (session, env) = onnxSession
+
+    val inputIdsLong: Array[Array[Long]] =
+      inputIds.map { tokenIds => tokenIds.map(_.toLong) }
+
+    val inputPositionIDsLong: Array[Array[Long]] =
+      inputIds.map { tokenIds =>
+        tokenIds.zipWithIndex.map { case (_, i) =>
+          i.toLong
+        }
+      }
+
+    val inputIdsLongTensor: OnnxTensor =
+      OnnxTensor.createTensor(env, inputIdsLong)
+    val decoderAttentionMask: OnnxTensor =
+      OnnxTensor.createTensor(env, inputIdsLong.map(_.map(_ => 1L)))
+    val decoderPositionIDs: OnnxTensor =
+      OnnxTensor.createTensor(env, inputPositionIDsLong)
+
+    val decoderInputs: java.util.Map[String, OnnxTensor] = Map(
+      OnnxSignatures.decoderInputIDs -> inputIdsLongTensor,
+      OnnxSignatures.decoderAttentionMask -> decoderAttentionMask,
+      OnnxSignatures.decoderPositionIDs -> decoderPositionIDs).asJava
+    val sessionOutput = session.run(decoderInputs)
+
+    val sequenceLength = inputIds.head.length
+    val batchSize = inputIds.length
+
+//    val logits = sessionOutput.getFloatArray(OnnxSignatures.decoderOutput)
+//    inputIdsLongTensor.close()
+//    decoderPositionIDs.close()
+//    decoderAttentionMask.close()
+//    val batchLogits = logits.grouped(vocabSize).toArray
+//    batchLogits
+
+    val logitsRaw = sessionOutput.getFloatArray(OnnxSignatures.decoderOutput)
+    val decoderOutputs = (0 until batchSize).map(i => {
+      logitsRaw
+        .slice(
+          i * sequenceLength * vocabSize + (sequenceLength - 1) * vocabSize,
+          i * sequenceLength * vocabSize + sequenceLength * vocabSize)
+    })
+    decoderOutputs.toArray
+  }
+
+  /** Gets the index with the highest score
+    *
+    * @param scores
+    *   Array of Scores to max
+    * @return
+    *   Index of the highest score
+    */
+  private def argmax(scores: Array[Float]): Int =
+    scores.zipWithIndex.maxBy { case (score, _) =>
+      score
+    }._2
+  private def greedyGenerationFinished(
+      decoderIds: Seq[Array[Int]],
+      eosTokenId: Int,
+      maxOutputLength: Int): Boolean =
+    decoderIds.map(_.last).forall(_ == eosTokenId) || decoderIds.head.length == maxOutputLength
+
+  private def generateGreedyOnnx(
+      inputIds: Array[Array[Int]],
+      onnxSession: (OrtSession, OrtEnvironment),
+      maxOutputLength: Int): (Array[Array[Int]]) = {
+
+    val sequencesLength = inputIds.map(x => x.length).toArray
+    val maxSentenceLength = sequencesLength.max // - curLen
+    var generatedIds: Array[Array[Int]] = inputIds
+    while (!greedyGenerationFinished(
+        generatedIds,
+        eosTokenId,
+        maxOutputLength + maxSentenceLength)) {
+
+      val (batchLogits: Array[Array[Float]]) =
+        Array(getDecoderOutputs(generatedIds, onnxSession).last)
+
+      val nextTokenIds: Array[Int] = batchLogits.map(argmax)
+      generatedIds =
+        generatedIds.zip(nextTokenIds).map { case (currentIds: Array[Int], nextId: Int) =>
+          currentIds ++ Array(nextId)
+        }
+    }
+    generatedIds
+  }
+
+  private object OnnxSignatures {
+    val decoderInputIDs: String = "input_ids"
+    val decoderAttentionMask: String = "attention_mask"
+    val decoderPositionIDs: String = "position_ids"
+
+    // create decoder past for 32 layers of key and value eg. past_key_values.0.key and past_key_values.0.value
+    val decoderPast: Array[String] = (0 until 32)
+      .flatMap(i => Seq(s"past_key_values.$i.key", s"past_key_values.$i.value"))
+      .toArray
+    val decoderOutput: String = "logits"
+    val decoderPresent: Array[String] =
+      (0 until 32).flatMap(i => Seq(s"present.$i.key", s"present.$i.value")).toArray
+  }
+
+}
diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/MistralTransformer.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/MistralTransformer.scala
new file mode 100644
index 00000000000000..0614b7b91ffd31
--- /dev/null
+++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/MistralTransformer.scala
@@ -0,0 +1,461 @@
+/*
+ * Copyright 2017-2022 John Snow Labs
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.johnsnowlabs.nlp.annotators.seq2seq
+
+import com.johnsnowlabs.ml.ai.util.Generation.GenerationConfig
+import com.johnsnowlabs.ml.ai.Mistral
+import com.johnsnowlabs.ml.onnx.OnnxWrapper.DecoderWrappers
+import com.johnsnowlabs.ml.onnx.{OnnxWrapper, ReadOnnxModel, WriteOnnxModel}
+import com.johnsnowlabs.ml.openvino.{OpenvinoWrapper, ReadOpenvinoModel, WriteOpenvinoModel}
+import com.johnsnowlabs.ml.util.LoadExternalModel.{
+  loadJsonStringAsset,
+  loadSentencePieceAsset,
+  modelSanityCheck,
+  notSupportedEngineError
+}
+import com.johnsnowlabs.ml.util.{ONNX, Openvino}
+import com.johnsnowlabs.nlp.AnnotatorType.DOCUMENT
+import com.johnsnowlabs.nlp._
+import com.johnsnowlabs.ml.tensorflow.sentencepiece.{
+  ReadSentencePieceModel,
+  SentencePieceWrapper,
+  WriteSentencePieceModel
+}
+import com.johnsnowlabs.nlp.serialization.MapFeature
+import org.apache.spark.broadcast.Broadcast
+import org.apache.spark.ml.param._
+import org.apache.spark.ml.util.Identifiable
+import org.apache.spark.sql.SparkSession
+import com.johnsnowlabs.nlp.serialization.{MapFeature, StructFeature}
+import org.json4s._
+import org.json4s.jackson.JsonMethods._
+
+/** Mistral 7B
+  *
+  * Mistral 7B, a 7.3 billion-parameter model that stands out for its efficient and effective
+  * performance in natural language processing. Surpassing Llama 2 13B across all benchmarks and
+  * excelling over Llama 1 34B in various aspects, Mistral 7B strikes a balance between English
+  * language tasks and code comprehension, rivaling the capabilities of CodeLlama 7B in the
+  * latter.
+  *
+  * Mistral 7B introduces Grouped-query attention (GQA) for quicker inference, enhancing
+  * processing speed without compromising accuracy. This streamlined approach ensures a smoother
+  * user experience, making Mistral 7B a practical choice for real-world applications.
+  *
+  * Additionally, Mistral 7B adopts Sliding Window Attention (SWA) to efficiently handle longer
+  * sequences at a reduced computational cost. This feature enhances the model's ability to
+  * process extensive textual input, expanding its utility in handling more complex tasks.
+  *
+  * In summary, Mistral 7B represents a notable advancement in language models, offering a
+  * reliable and versatile solution for various natural language processing challenges.
+  *
+  * Pretrained models can be loaded with `pretrained` of the companion object:
+  * {{{
+  * val mistral = MistralTransformer.pretrained()
+  *   .setInputCols("document")
+  *   .setOutputCol("generation")
+  * }}}
+  * The default model is `"mistral-7b"`, if no name is provided. For available pretrained models
+  * please see the [[https://sparknlp.org/models?q=mistral Models Hub]].
+  *
+  * For extended examples of usage, see
+  * [[https://github.com/JohnSnowLabs/spark-nlp/blob/master/src/test/scala/com/johnsnowlabs/nlp/annotators/seq2seq/MistralTestSpec.scala MistralTestSpec]].
+  *
+  * '''References:'''
+  *   - [[https://mistral.ai/news/announcing-mistral-7b/ Mistral 7B]]
+  *   - [[https://github.com/mistralai/mistral-src]]
+  *
+  * '''Paper Abstract:'''
+  *
+  * ''We introduce Mistral 7B v0.1, a 7-billion-parameter language model engineered for superior
+  * performance and efficiency. Mistral 7B outperforms Llama 2 13B across all evaluated
+  * benchmarks, and Llama 1 34B in reasoning, mathematics, and code generation. Our model
+  * leverages grouped-query attention (GQA) for faster inference, coupled with sliding window
+  * attention (SWA) to effectively handle sequences of arbitrary length with a reduced inference
+  * cost. We also provide a model fine-tuned to follow instructions, Mistral 7B -- Instruct, that
+  * surpasses the Llama 2 13B -- Chat model both on human and automated benchmarks. Our models are
+  * released under the Apache 2.0 license.''
+  *
+  * '''Note:'''
+  *
+  * This is a very computationally expensive module especially on larger sequence. The use of an
+  * accelerator such as GPU is recommended.
+  *
+  * ==Example==
+  * {{{
+  * import spark.implicits._
+  * import com.johnsnowlabs.nlp.base.DocumentAssembler
+  * import com.johnsnowlabs.nlp.annotators.seq2seq.MistralTransformer
+  * import org.apache.spark.ml.Pipeline
+  *
+  * val documentAssembler = new DocumentAssembler()
+  *   .setInputCol("text")
+  *   .setOutputCol("documents")
+  *
+  * val mistral = MistralTransformer.pretrained("mistral-7b")
+  *   .setInputCols(Array("documents"))
+  *   .setMinOutputLength(10)
+  *   .setMaxOutputLength(50)
+  *   .setDoSample(false)
+  *   .setTopK(50)
+  *   .setNoRepeatNgramSize(3)
+  *   .setOutputCol("generation")
+  *
+  * val pipeline = new Pipeline().setStages(Array(documentAssembler, mistral))
+  *
+  * val data = Seq(
+  *   "My name is Leonardo."
+  * ).toDF("text")
+  * val result = pipeline.fit(data).transform(data)
+  *
+  * results.select("generation.result").show(truncate = false)
+  *  +----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+  *  |result                                                                                                                                                                                              |
+  *  +----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+  *  |[Leonardo Da Vinci invented the microscope?\n Question: Leonardo Da Vinci invented the microscope?\n Answer: No, Leonardo Da Vinci did not invent the microscope. The first microscope was invented |
+  *  | in the late 16th century, long after Leonardo']                                                                                                                                                    |
+  *  -----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+  *  +----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+  * }}}
+  *
+  * @param uid
+  *   required uid for storing annotator to disk
+  * @groupname anno Annotator types
+  * @groupdesc anno
+  *   Required input and expected output annotator types
+  * @groupname Ungrouped Members
+  * @groupname param Parameters
+  * @groupname setParam Parameter setters
+  * @groupname getParam Parameter getters
+  * @groupname Ungrouped Members
+  * @groupprio param  1
+  * @groupprio anno  2
+  * @groupprio Ungrouped 3
+  * @groupprio setParam  4
+  * @groupprio getParam  5
+  * @groupdesc param
+  *   A list of (hyper-)parameter keys this annotator can take. Users can set and get the
+  *   parameter values through setters and getters, respectively.
+  */
+class MistralTransformer(override val uid: String)
+    extends AnnotatorModel[MistralTransformer]
+    with HasBatchedAnnotate[MistralTransformer]
+    with ParamsAndFeaturesWritable
+    with WriteOnnxModel
+    with WriteOpenvinoModel
+    with HasGeneratorProperties
+    with WriteSentencePieceModel
+    with HasEngine {
+
+  def this() = this(Identifiable.randomUID("MistralTRANSFORMER"))
+
+  /** Input annotator type : DOCUMENT
+    *
+    * @group param
+    */
+  override val inputAnnotatorTypes: Array[AnnotatorType] = Array(DOCUMENT)
+
+  /** Output annotator type : DOCUMENT
+    *
+    * @group param
+    */
+  override val outputAnnotatorType: String = DOCUMENT
+
+  /** @group setParam */
+  def setRandomSeed(value: Int): MistralTransformer.this.type = {
+    if (randomSeed.isEmpty) {
+      this.randomSeed = Some(value)
+    }
+    this
+  }
+
+  /** A list of token ids which are ignored in the decoder's output (Default: `Array()`)
+    *
+    * @group param
+    */
+  var ignoreTokenIds = new IntArrayParam(
+    this,
+    "ignoreTokenIds",
+    "A list of token ids which are ignored in the decoder's output")
+
+  /** @group setParam */
+  def setIgnoreTokenIds(tokenIds: Array[Int]): MistralTransformer.this.type = {
+    set(ignoreTokenIds, tokenIds)
+  }
+
+  /** @group getParam */
+  def getIgnoreTokenIds: Array[Int] = $(ignoreTokenIds)
+
+  private var _model: Option[Broadcast[Mistral]] = None
+
+  val generationConfig: StructFeature[GenerationConfig] =
+    new StructFeature(this, "generationConfig").setProtected()
+
+  def setGenerationConfig(value: GenerationConfig): this.type =
+    set(generationConfig, value)
+
+  def getGenerationConfig: GenerationConfig = $$(generationConfig)
+
+  /** @group setParam */
+  def setModelIfNotSet(
+      spark: SparkSession,
+      onnxWrappers: Option[DecoderWrappers],
+      openvinoWrapper: Option[OpenvinoWrapper],
+      spp: SentencePieceWrapper): this.type = {
+    if (_model.isEmpty) {
+      _model = Some(
+        spark.sparkContext.broadcast(
+          new Mistral(
+            onnxWrappers,
+            openvinoWrapper,
+            spp = spp,
+            generationConfig = getGenerationConfig)))
+    }
+    this
+  }
+
+  /** @group getParam */
+  def getModelIfNotSet: Mistral = _model.get.value
+
+  setDefault(
+    minOutputLength -> 0,
+    maxOutputLength -> 200,
+    doSample -> false,
+    temperature -> 1,
+    topK -> 50,
+    topP -> 1,
+    repetitionPenalty -> 1.0,
+    noRepeatNgramSize -> 3,
+    ignoreTokenIds -> Array(),
+    batchSize -> 1,
+    beamSize -> 1,
+    maxInputLength -> 4096)
+
+  /** takes a document and annotations and produces new annotations of this annotator's annotation
+    * type
+    *
+    * @param batchedAnnotations
+    *   Annotations that correspond to inputAnnotationCols generated by previous annotators if any
+    * @return
+    *   any number of annotations processed for every input annotation. Not necessary one to one
+    *   relationship
+    */
+  override def batchAnnotate(batchedAnnotations: Seq[Array[Annotation]]): Seq[Seq[Annotation]] = {
+
+    val allAnnotations = batchedAnnotations
+      .filter(_.nonEmpty)
+      .zipWithIndex
+      .flatMap { case (annotations, i) =>
+        annotations.filter(_.result.nonEmpty).map(x => (x, i))
+      }
+    val processedAnnotations = if (allAnnotations.nonEmpty) {
+      this.getModelIfNotSet.predict(
+        sentences = allAnnotations.map(_._1),
+        batchSize = $(batchSize),
+        minOutputLength = $(minOutputLength),
+        maxOutputLength = $(maxOutputLength),
+        doSample = $(doSample),
+        temperature = $(temperature),
+        topK = $(topK),
+        topP = $(topP),
+        repetitionPenalty = $(repetitionPenalty),
+        noRepeatNgramSize = $(noRepeatNgramSize),
+        randomSeed = this.randomSeed,
+        ignoreTokenIds = $(ignoreTokenIds),
+        beamSize = $(beamSize),
+        maxInputLength = $(maxInputLength))
+    } else {
+      Seq()
+    }
+    Seq(processedAnnotations)
+  }
+
+  override def onWrite(path: String, spark: SparkSession): Unit = {
+    super.onWrite(path, spark)
+    getEngine match {
+      case ONNX.name =>
+        val wrappers = getModelIfNotSet.onnxWrappers
+        writeOnnxModels(
+          path,
+          spark,
+          Seq((wrappers.get.decoder, "decoder_model.onnx")),
+          MistralTransformer.suffix)
+        val obj = getModelIfNotSet
+        writeSentencePieceModel(
+          path,
+          spark,
+          obj.spp,
+          MistralTransformer.suffix,
+          MistralTransformer.sppFile)
+      case Openvino.name =>
+        val wrappers = getModelIfNotSet.openvinoWrapper
+        writeOpenvinoModel(
+          path,
+          spark,
+          wrappers.get,
+          MistralTransformer.suffix,
+          MistralTransformer.openvinoFile)
+        val obj = getModelIfNotSet
+        writeSentencePieceModel(
+          path,
+          spark,
+          obj.spp,
+          MistralTransformer.suffix,
+          MistralTransformer.sppFile)
+    }
+  }
+}
+
+trait ReadablePretrainedMistralTransformerModel
+    extends ParamsAndFeaturesReadable[MistralTransformer]
+    with HasPretrained[MistralTransformer] {
+  override val defaultModelName: Some[String] = Some("mistral-7b")
+
+  /** Java compliant-overrides */
+  override def pretrained(): MistralTransformer = super.pretrained()
+
+  override def pretrained(name: String): MistralTransformer = super.pretrained(name)
+
+  override def pretrained(name: String, lang: String): MistralTransformer =
+    super.pretrained(name, lang)
+
+  override def pretrained(name: String, lang: String, remoteLoc: String): MistralTransformer =
+    super.pretrained(name, lang, remoteLoc)
+}
+
+trait ReadMistralTransformerDLModel
+    extends ReadOnnxModel
+    with ReadOpenvinoModel
+    with ReadSentencePieceModel {
+  this: ParamsAndFeaturesReadable[MistralTransformer] =>
+
+  override val onnxFile: String = "mistral_onnx"
+  val suffix: String = "_mistral"
+  override val sppFile: String = "mistral_spp"
+  override val openvinoFile: String = "mistral_openvino"
+
+  def readModel(instance: MistralTransformer, path: String, spark: SparkSession): Unit = {
+    instance.getEngine match {
+      case ONNX.name =>
+        val wrappers =
+          readOnnxModels(path, spark, Seq("decoder_model.onnx"), suffix)
+        val onnxWrappers =
+          DecoderWrappers(decoder = wrappers("decoder_model.onnx"))
+        val spp = readSentencePieceModel(path, spark, "_mistral_spp", sppFile)
+        instance.setModelIfNotSet(spark, Some(onnxWrappers), None, spp)
+      case Openvino.name =>
+        val ovWrapper =
+          readOpenvinoModel(path, spark, "_mistral_ov")
+        val spp = readSentencePieceModel(path, spark, "_mistral_spp", sppFile)
+        instance.setModelIfNotSet(spark, None, Some(ovWrapper), spp)
+      case _ =>
+        throw new Exception(notSupportedEngineError)
+    }
+  }
+
+  addReader(readModel)
+
+  def loadSavedModel(
+      modelPath: String,
+      spark: SparkSession,
+      useOpenvino: Boolean = false): MistralTransformer = {
+    implicit val formats: DefaultFormats.type = DefaultFormats // for json4
+    val (localModelPath, detectedEngine) =
+      modelSanityCheck(modelPath, isDecoder = true)
+    val modelConfig: JValue =
+      parse(loadJsonStringAsset(localModelPath, "config.json"))
+
+    val beginSuppressTokens: Array[Int] =
+      (modelConfig \ "begin_suppress_tokens").extract[Array[Int]]
+
+    val suppressTokenIds: Array[Int] =
+      (modelConfig \ "suppress_tokens").extract[Array[Int]]
+
+    val forcedDecoderIds: Array[(Int, Int)] =
+      (modelConfig \ "forced_decoder_ids").extract[Array[Array[Int]]].map {
+        case idxWithTokenId: Array[Int] if idxWithTokenId.length == 2 =>
+          (idxWithTokenId(0), idxWithTokenId(1))
+        case _ =>
+          throw new Exception(
+            "Could not extract forced_decoder_ids. Should be a list of tuples with 2 entries.")
+      }
+
+    def arrayOrNone[T](array: Array[T]): Option[Array[T]] =
+      if (array.nonEmpty) Some(array) else None
+
+    val bosTokenId = (modelConfig \ "bos_token_id").extract[Int]
+    val eosTokenId = (modelConfig \ "eos_token_id").extract[Int]
+    val padTokenId = (modelConfig \ "eos_token_id").extract[Int]
+    val vocabSize = (modelConfig \ "vocab_size").extract[Int]
+
+    val annotatorModel = new MistralTransformer()
+      .setGenerationConfig(
+        GenerationConfig(
+          bosTokenId,
+          padTokenId,
+          eosTokenId,
+          vocabSize,
+          arrayOrNone(beginSuppressTokens),
+          arrayOrNone(suppressTokenIds),
+          arrayOrNone(forcedDecoderIds)))
+    val spModel = loadSentencePieceAsset(localModelPath, "tokenizer.model")
+    val modelEngine =
+      if (useOpenvino)
+        Openvino.name
+      else
+        detectedEngine
+    annotatorModel.set(annotatorModel.engine, modelEngine)
+
+    modelEngine match {
+      case ONNX.name =>
+        val onnxWrapperDecoder =
+          OnnxWrapper.read(
+            spark,
+            localModelPath,
+            zipped = false,
+            useBundle = true,
+            modelName = "decoder_model",
+            dataFileSuffix = Some(".onnx_data"),
+            onnxFileSuffix = Some(suffix))
+
+        val onnxWrappers = DecoderWrappers(onnxWrapperDecoder)
+
+        annotatorModel
+          .setModelIfNotSet(spark, Some(onnxWrappers), None, spModel)
+
+      case Openvino.name =>
+        val openvinoWrapper =
+          OpenvinoWrapper.read(
+            spark,
+            localModelPath,
+            zipped = false,
+            useBundle = true,
+            detectedEngine = detectedEngine)
+        annotatorModel.setModelIfNotSet(spark, None, Some(openvinoWrapper), spModel)
+
+      case _ =>
+        throw new Exception(notSupportedEngineError)
+    }
+
+    annotatorModel
+  }
+
+}
+
+object MistralTransformer
+    extends ReadablePretrainedMistralTransformerModel
+    with ReadMistralTransformerDLModel
diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/seq2seq/MistralTestSpec.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/seq2seq/MistralTestSpec.scala
new file mode 100644
index 00000000000000..0a51ae130360f2
--- /dev/null
+++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/seq2seq/MistralTestSpec.scala
@@ -0,0 +1,51 @@
+/*
+ * Copyright 2017-2023 John Snow Labs
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.johnsnowlabs.nlp.annotators.seq2seq
+
+import com.johnsnowlabs.nlp.base.DocumentAssembler
+import com.johnsnowlabs.nlp.util.io.ResourceHelper
+import com.johnsnowlabs.tags.{FastTest, SlowTest}
+import org.apache.spark.ml.Pipeline
+import org.scalatest.flatspec.AnyFlatSpec
+
+class MistralTestSpec extends AnyFlatSpec {
+
+  "mistral-7b" should "should handle temperature=0 correctly and not crash when predicting more than 1 element with doSample=True" taggedAs SlowTest in {
+    // Even tough the Paper states temperature in interval [0,1), using temperature=0 will result in division by 0 error.
+    // Also DoSample=True may result in infinities being generated and distFiltered.length==0 which results in exception if we don't return 0 instead internally.
+    val testData = ResourceHelper.spark
+      .createDataFrame(Seq((1, "Leonardo Da Vinci invented the microscope?")))
+      .toDF("id", "text")
+      .repartition(1)
+    val documentAssembler = new DocumentAssembler()
+      .setInputCol("text")
+      .setOutputCol("documents")
+
+    val bart = MistralTransformer
+      .pretrained()
+      .setInputCols(Array("documents"))
+      .setDoSample(false)
+      .setMaxOutputLength(50)
+      .setOutputCol("generation")
+      .setBeamSize(1)
+    new Pipeline()
+      .setStages(Array(documentAssembler, bart))
+      .fit(testData)
+      .transform(testData)
+      .show(truncate = false)
+  }
+}