From 5843d407669be2e98dee9fbc8192a580e2079e2f Mon Sep 17 00:00:00 2001
From: Prabod Rathnayaka <prabod@rathnayaka.me>
Date: Wed, 28 Feb 2024 12:24:03 +0000
Subject: [PATCH 1/4] QwenTransformer scala api and tests

---
 .../scala/com/johnsnowlabs/ml/ai/Qwen.scala   | 361 +++++++++++++++
 .../annotators/seq2seq/QwenTransformer.scala  | 435 ++++++++++++++++++
 .../tokenizer/bpe/BpeSpecialTokens.scala      |   9 +
 .../tokenizer/bpe/BpeTokenizer.scala          |   7 +
 .../tokenizer/bpe/QwenTokenizer.scala         |  31 ++
 .../nlp/annotators/seq2seq/QwenTestSpec.scala |  52 +++
 6 files changed, 895 insertions(+)
 create mode 100644 src/main/scala/com/johnsnowlabs/ml/ai/Qwen.scala
 create mode 100644 src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/QwenTransformer.scala
 create mode 100644 src/main/scala/com/johnsnowlabs/nlp/annotators/tokenizer/bpe/QwenTokenizer.scala
 create mode 100644 src/test/scala/com/johnsnowlabs/nlp/annotators/seq2seq/QwenTestSpec.scala

diff --git a/src/main/scala/com/johnsnowlabs/ml/ai/Qwen.scala b/src/main/scala/com/johnsnowlabs/ml/ai/Qwen.scala
new file mode 100644
index 00000000000000..750f911225e90c
--- /dev/null
+++ b/src/main/scala/com/johnsnowlabs/ml/ai/Qwen.scala
@@ -0,0 +1,361 @@
+/*
+ * Copyright 2017 - 2023  John Snow Labs
+ *
+ *    Licensed under the Apache License, Version 2.0 (the "License");
+ *    you may not use this file except in compliance with the License.
+ *    You may obtain a copy of the License at
+ *
+ *        http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *    Unless required by applicable law or agreed to in writing, software
+ *    distributed under the License is distributed on an "AS IS" BASIS,
+ *    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *    See the License for the specific language governing permissions and
+ *    limitations under the License.
+ */
+
+package com.johnsnowlabs.ml.ai
+
+import ai.onnxruntime.{OnnxTensor, OrtEnvironment, OrtSession}
+import com.johnsnowlabs.ml.ai.util.Generation.{Generate, GenerationConfig}
+import com.johnsnowlabs.ml.onnx.OnnxSession
+import com.johnsnowlabs.ml.onnx.OnnxWrapper.DecoderWrappers
+import com.johnsnowlabs.ml.onnx.TensorResources.implicits._
+import com.johnsnowlabs.ml.tensorflow.sentencepiece.SentencePieceWrapper
+import com.johnsnowlabs.nlp.Annotation
+import com.johnsnowlabs.nlp.AnnotatorType.DOCUMENT
+import com.johnsnowlabs.nlp.annotators.common.SentenceSplit
+import com.johnsnowlabs.nlp.annotators.tokenizer.bpe.{BpeTokenizer, QwenTokenizer}
+import org.tensorflow.{Session, Tensor}
+
+import scala.collection.JavaConverters._
+
+private[johnsnowlabs] class Qwen(
+    val onnxWrappers: DecoderWrappers,
+    merges: Map[(String, String), Int],
+    vocabulary: Map[String, Int],
+    generationConfig: GenerationConfig)
+    extends Serializable
+    with Generate {
+
+  private val onnxSessionOptions: Map[String, String] = new OnnxSession().getSessionOptions
+  val bpeTokenizer: QwenTokenizer = BpeTokenizer
+    .forModel("qwen", merges = merges, vocab = vocabulary, padWithSequenceTokens = false)
+    .asInstanceOf[QwenTokenizer]
+  private val GenerationConfig(
+    bosTokenId: Int,
+    paddingTokenId: Int,
+    eosTokenId: Int,
+    vocabSize: Int,
+    beginSuppressTokens,
+    suppressTokenIds,
+    forcedDecoderIds) =
+    generationConfig
+
+  /** Decode generated token ids back into text
+    * @param sentences
+    *   Token ids of every generated sequence
+    * @return
+    *   Decoded text of every sequence, in input order
+    */
+  def decode(sentences: Array[Array[Int]]): Seq[String] = {
+    sentences.map(tokenIds => bpeTokenizer.decodeTokens(tokenIds))
+  }
+
+  /** Convert annotations into BPE token ids
+    * @param sentences
+    *   Annotations to encode
+    * @return
+    *   One array of token piece ids per unpacked sentence
+    */
+  def encode(sentences: Seq[Annotation]): Seq[Array[Int]] = {
+    val unpacked = SentenceSplit.unpack(sentences)
+    for (sentence <- unpacked) yield {
+      // tokenize the sentence, then flatten the BPE pieces of every token into ids
+      val indexedTokens = bpeTokenizer.tokenize(sentence)
+      val pieceIds = indexedTokens.flatMap { token =>
+        bpeTokenizer.encode(token).map(_.pieceId)
+      }
+      pieceIds
+    }
+  }
+
+  /** Generates token ids for a batch of encoded prompts with the ONNX decoder.
+    *
+    * This model has no encoder, so dummy encoder state and attention mask tensors are passed
+    * to `generate` and closed once it returns. The generation settings (sampling, penalties,
+    * beam size, seed) are forwarded to `generate` as given.
+    *
+    * @param batch
+    *   Encoded prompts, one array of token ids per prompt; must not be empty
+    * @param maxOutputLength
+    *   Added to the length of the longest prompt to form the length limit given to
+    *   `generate`
+    * @param doSample
+    *   Forwarded to `generate`
+    * @param randomSeed
+    *   Optional seed forwarded to `generate`
+    * @param ignoreTokenIds
+    *   Token ids forwarded to `generate`
+    * @param maxInputLength
+    *   Currently not used by this method
+    * @return
+    *   The token id sequences returned by `generate`
+    */
+  def tag(
+      batch: Seq[Array[Int]],
+      minOutputLength: Int,
+      maxOutputLength: Int,
+      doSample: Boolean,
+      temperature: Double,
+      topK: Int,
+      topP: Double,
+      repetitionPenalty: Double,
+      noRepeatNgramSize: Int,
+      randomSeed: Option[Long],
+      ignoreTokenIds: Array[Int] = Array(),
+      beamSize: Int,
+      maxInputLength: Int): Array[Array[Int]] = {
+    val (decoderSession, env) = onnxWrappers.decoder.getSession(onnxSessionOptions)
+    val maxSentenceLength = batch.map(_.length).max
+
+    // dummy tensors for decoder encode state and attention mask; they are created here,
+    // so they are also released here once generation has finished (or failed)
+    val dummyEncoderState = OnnxTensor.createTensor(env, Array(0))
+    val dummyEncoderAttentionMask = OnnxTensor.createTensor(env, Array(1))
+
+    try {
+      generate(
+        batch,
+        Right(dummyEncoderState),
+        Right(dummyEncoderAttentionMask),
+        batch.toArray,
+        maxOutputLength + maxSentenceLength,
+        minOutputLength,
+        doSample,
+        beamSize,
+        1,
+        temperature,
+        topK,
+        topP,
+        repetitionPenalty,
+        noRepeatNgramSize,
+        this.vocabSize,
+        this.eosTokenId,
+        this.paddingTokenId,
+        randomSeed,
+        ignoreTokenIds,
+        Right((env, decoderSession)),
+        applySoftmax = false)
+    } finally {
+      dummyEncoderState.close()
+      dummyEncoderAttentionMask.close()
+    }
+  }
+
+  /** Generates text for the given annotations in groups of `batchSize` and wraps every result
+    * in a DOCUMENT annotation carrying the metadata of its input.
+    */
+  def predict(
+      sentences: Seq[Annotation],
+      batchSize: Int,
+      minOutputLength: Int,
+      maxOutputLength: Int,
+      doSample: Boolean,
+      temperature: Double,
+      topK: Int,
+      topP: Double,
+      repetitionPenalty: Double,
+      noRepeatNgramSize: Int,
+      randomSeed: Option[Long] = None,
+      ignoreTokenIds: Array[Int] = Array(),
+      beamSize: Int,
+      maxInputLength: Int): Seq[Annotation] = {
+
+    val generatedTexts = sentences.grouped(batchSize).toArray.flatMap { batch =>
+      val tokenIds = tag(
+        encode(batch),
+        minOutputLength,
+        maxOutputLength,
+        doSample,
+        temperature,
+        topK,
+        topP,
+        repetitionPenalty,
+        noRepeatNgramSize,
+        randomSeed,
+        ignoreTokenIds,
+        beamSize,
+        maxInputLength)
+      decode(tokenIds)
+    }
+
+    // offsets are contiguous: every result begins right after the previous one ends
+    var nextBegin = 0
+    generatedTexts.zip(sentences).map { case (content, sent) =>
+      val begin = nextBegin
+      val end = begin + content.length - 1
+      nextBegin = end + 1
+      new Annotation(
+        annotatorType = DOCUMENT,
+        begin = begin,
+        end = end,
+        result = content,
+        metadata = sent.metadata)
+    }
+  }
+
+  /** Runs one decoding step on the last token of every sequence with `decoderPast` as extra
+    * inputs; returns the logits grouped by `vocabSize` and the `present` output tensors.
+    */
+  private def getDecoderOutputsWithPast(
+      inputIds: Array[Array[Int]],
+      decoderPast: Map[String, OnnxTensor],
+      onnxSession: (OrtSession, OrtEnvironment))
+      : (Array[Array[Float]], Map[String, OnnxTensor]) = {
+    val (session, env) = onnxSession
+    val lastTokens: Array[Array[Long]] = inputIds.map(tokenIds => Array(tokenIds.last.toLong))
+    val lastTokensTensor: OnnxTensor = OnnxTensor.createTensor(env, lastTokens)
+    val decoderAttentionMask: OnnxTensor =
+      OnnxTensor.createTensor(env, lastTokens.map(_.map(_ => 1L)))
+    try {
+      val decoderWithPastInputs: java.util.Map[String, OnnxTensor] = (Map(
+        OnnxSignatures.decoderInputIDs -> lastTokensTensor,
+        OnnxSignatures.decoderAttentionMask -> decoderAttentionMask) ++ decoderPast).asJava
+      val sessionOutput = session.run(decoderWithPastInputs)
+      val logits = sessionOutput.getFloatArray(OnnxSignatures.decoderOutput)
+      // the result is left open: the `present` tensors taken from it are returned to the caller
+      val decoderPresent = sessionOutput.getOnnxTensors(OnnxSignatures.decoderPresent)
+      (logits.grouped(vocabSize).toArray, decoderPresent)
+    } finally {
+      lastTokensTensor.close()
+      decoderAttentionMask.close()
+    }
+  }
+
+  /** Computes the logits used to pick the next token for the current decoder inputs.
+    *
+    * Only the ONNX branch is implemented: the decoder input ids are run through
+    * `getDecoderOutputs`, the remaining arguments are not used. A TensorFlow session yields an
+    * empty array.
+    */
+  override def getModelOutput(
+      encoderInputIds: Seq[Array[Int]],
+      decoderInputIds: Seq[Array[Int]],
+      decoderEncoderStateTensors: Either[Tensor, OnnxTensor],
+      encoderAttentionMaskTensors: Either[Tensor, OnnxTensor],
+      maxLength: Int,
+      session: Either[Session, (OrtEnvironment, OrtSession)]): Array[Array[Float]] = {
+    session match {
+      case Left(_) =>
+        // not implemented yet
+        Array()
+      case Right((env, decoderSession)) =>
+        getDecoderOutputs(decoderInputIds.toArray, onnxSession = (decoderSession, env))
+    }
+  }
+  /** Runs the decoder on the full input sequences and returns, for every batch entry, the
+    * logits of the last position only.
+    *
+    * All rows of `inputIds` are expected to have the same length (the length of the first row
+    * is used to locate the last position).
+    *
+    * @param inputIds
+    *   Token ids per batch entry
+    * @return
+    *   One logits array of size `vocabSize` per batch entry
+    */
+  private def getDecoderOutputs(
+      inputIds: Array[Array[Int]],
+      onnxSession: (OrtSession, OrtEnvironment)): (Array[Array[Float]]) = {
+    val (session, env) = onnxSession
+
+    val inputIdsLong: Array[Array[Long]] = inputIds.map(_.map(_.toLong))
+    // position ids are simply 0 until sequence length for every row
+    val inputPositionIDsLong: Array[Array[Long]] = inputIds.map(_.indices.map(_.toLong).toArray)
+
+    val inputIdsLongTensor: OnnxTensor = OnnxTensor.createTensor(env, inputIdsLong)
+    val decoderAttentionMask: OnnxTensor =
+      OnnxTensor.createTensor(env, inputIdsLong.map(_.map(_ => 1L)))
+    val decoderPositionIDs: OnnxTensor = OnnxTensor.createTensor(env, inputPositionIDsLong)
+
+    try {
+      val decoderInputs: java.util.Map[String, OnnxTensor] = Map(
+        OnnxSignatures.decoderInputIDs -> inputIdsLongTensor,
+        OnnxSignatures.decoderAttentionMask -> decoderAttentionMask,
+        OnnxSignatures.decoderPositionIDs -> decoderPositionIDs).asJava
+      val sessionOutput = session.run(decoderInputs)
+      try {
+        val sequenceLength = inputIds.head.length
+        val rowSize = sequenceLength * vocabSize
+        val logitsRaw = sessionOutput.getFloatArray(OnnxSignatures.decoderOutput)
+        // indexing treats the logits as flattened (batch, sequence, vocab); the slices are
+        // copied out before the session result is closed
+        inputIds.indices.map { i =>
+          logitsRaw.slice(i * rowSize + rowSize - vocabSize, i * rowSize + rowSize)
+        }.toArray
+      } finally {
+        sessionOutput.close()
+      }
+    } finally {
+      inputIdsLongTensor.close()
+      decoderAttentionMask.close()
+      decoderPositionIDs.close()
+    }
+  }
+
+  /** Finds the position of the largest score
+    *
+    * @param scores
+    *   Scores to search
+    * @return
+    *   Index of the highest score
+    */
+  private def argmax(scores: Array[Float]): Int = {
+    val positions = scores.indices
+    positions.maxBy(position => scores(position))
+  }
+  private def greedyGenerationFinished(
+      decoderIds: Seq[Array[Int]],
+      eosTokenId: Int,
+      maxOutputLength: Int): Boolean =
+    decoderIds.map(_.last).forall(_ == eosTokenId) || decoderIds.head.length == maxOutputLength
+
+  /** Greedy decoding: repeatedly appends the highest-scoring next token to every sequence
+    * until `greedyGenerationFinished` reports that all sequences end with `eosTokenId` or the
+    * length limit (`maxOutputLength` plus the longest input length) is reached.
+    */
+  private def generateGreedyOnnx(
+      inputIds: Array[Array[Int]],
+      onnxSession: (OrtSession, OrtEnvironment),
+      maxOutputLength: Int): (Array[Array[Int]]) = {
+
+    // the limit is loop-invariant, so compute it once
+    val maxTotalLength = maxOutputLength + inputIds.map(_.length).max
+    var generatedIds: Array[Array[Int]] = inputIds
+
+    while (!greedyGenerationFinished(generatedIds, eosTokenId, maxTotalLength)) {
+      // getDecoderOutputs already yields one last-position logits row per sequence, so every
+      // sequence gets its own next token (taking only `.last` would drop all but one sequence)
+      val batchLogits: Array[Array[Float]] = getDecoderOutputs(generatedIds, onnxSession)
+      val nextTokenIds: Array[Int] = batchLogits.map(argmax)
+      generatedIds = generatedIds.zip(nextTokenIds).map { case (currentIds, nextId) =>
+        currentIds :+ nextId
+      }
+    }
+    generatedIds
+  }
+
+  /** Names of the ONNX graph inputs and outputs used by this class. */
+  private object OnnxSignatures {
+    val decoderInputIDs: String = "input_ids"
+    val decoderAttentionMask: String = "attention_mask"
+    val decoderPositionIDs: String = "position_ids"
+    val decoderOutput: String = "logits"
+    // key/value names for 32 layers, e.g. past_key_values.0.key and present.0.key
+    private val layers = 0 until 32
+    val decoderPast: Array[String] =
+      layers.flatMap(i => Seq(s"past_key_values.$i.key", s"past_key_values.$i.value")).toArray
+    val decoderPresent: Array[String] =
+      layers.flatMap(i => Seq(s"present.$i.key", s"present.$i.value")).toArray
+  }
+
+}
diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/QwenTransformer.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/QwenTransformer.scala
new file mode 100644
index 00000000000000..479d7cbb045d34
--- /dev/null
+++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/QwenTransformer.scala
@@ -0,0 +1,435 @@
+/*
+ * Copyright 2017-2024 John Snow Labs
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.johnsnowlabs.nlp.annotators.seq2seq
+
+import com.johnsnowlabs.ml.ai.util.Generation.GenerationConfig
+import com.johnsnowlabs.ml.ai.Qwen
+import com.johnsnowlabs.ml.onnx.OnnxWrapper.DecoderWrappers
+import com.johnsnowlabs.ml.onnx.{OnnxWrapper, ReadOnnxModel, WriteOnnxModel}
+import com.johnsnowlabs.ml.util.LoadExternalModel.{
+  loadJsonStringAsset,
+  loadSentencePieceAsset,
+  loadTextAsset,
+  modelSanityCheck,
+  notSupportedEngineError
+}
+import com.johnsnowlabs.ml.util.ONNX
+import com.johnsnowlabs.nlp.AnnotatorType.DOCUMENT
+import com.johnsnowlabs.nlp._
+import com.johnsnowlabs.ml.tensorflow.sentencepiece.{
+  ReadSentencePieceModel,
+  SentencePieceWrapper,
+  WriteSentencePieceModel
+}
+import com.johnsnowlabs.nlp.serialization.MapFeature
+import org.apache.spark.broadcast.Broadcast
+import org.apache.spark.ml.param._
+import org.apache.spark.ml.util.Identifiable
+import org.apache.spark.sql.SparkSession
+import com.johnsnowlabs.nlp.serialization.{MapFeature, StructFeature}
+import org.json4s._
+import org.json4s.jackson.JsonMethods._
+
+/** Qwen: comprehensive language model series
+  *
+  * Qwen1.5 is the beta version of Qwen2, a transformer-based decoder-only language model
+  * pretrained on a large amount of data. In comparison with the previous released Qwen, the
+  * improvements include:
+  *
+  * 6 model sizes, including 0.5B, 1.8B, 4B, 7B, 14B, and 72B; Significant performance improvement
+  * in Chat models; Multilingual support of both base and chat models; Stable support of 32K
+  * context length for models of all sizes
+  *
+  * Qwen1.5 is a language model series including decoder language models of different model sizes.
+  * For each size, we release the base language model and the aligned chat model. It is based on
+  * the Transformer architecture with SwiGLU activation, attention QKV bias, group query
+  * attention, mixture of sliding window attention and full attention, etc. Additionally, we have
+  * an improved tokenizer adaptive to multiple natural languages and codes. For the beta version,
+  * temporarily we did not include GQA and the mixture of SWA and full attention.
+  *
+  * Pretrained models can be loaded with `pretrained` of the companion object:
+  * {{{
+  * val Qwen = QwenTransformer.pretrained()
+  *   .setInputCols("document")
+  *   .setOutputCol("generation")
+  * }}}
+  * The default model is `"Qwen-7b"`, if no name is provided. For available pretrained models
+  * please see the [[https://sparknlp.org/models?q=Qwen Models Hub]].
+  *
+  * For extended examples of usage, see
+  * [[https://github.com/JohnSnowLabs/spark-nlp/blob/master/src/test/scala/com/johnsnowlabs/nlp/annotators/seq2seq/QwenTestSpec.scala QwenTestSpec]].
+  *
+  * '''References:'''
+  *   - [[https://arxiv.org/pdf/2309.16609.pdf Qwen Technical Report]]
+  *   - [[https://qwenlm.github.io/blog/qwen1.5/]]
+  *   - [[https://github.com/QwenLM/Qwen1.5]]
+  *
+  * '''Paper Abstract:'''
+  *
+  * ''Large language models (LLMs) have revolutionized the field of artificial intelligence,
+  * enabling natural language processing tasks that were previously thought to be exclusive to
+  * humans. In this work, we introduce Qwen, the first installment of our large language model
+  * series. Qwen is a comprehensive language model series that encompasses distinct models with
+  * varying parameter counts. It includes Qwen, the base pretrained language models, and
+  * Qwen-Chat, the chat models finetuned with human alignment techniques. The base language models
+  * consistently demonstrate superior performance across a multitude of downstream tasks, and the
+  * chat models, particularly those trained using Reinforcement Learning from Human Feedback
+  * (RLHF), are highly competitive. The chat models possess advanced tool-use and planning
+  * capabilities for creating agent applications, showcasing impressive performance even when
+  * compared to bigger models on complex tasks like utilizing a code interpreter. Furthermore, we
+  * have developed coding-specialized models, Code-Qwen and Code-Qwen-Chat, as well as
+  * mathematics-focused models, Math-Qwen-Chat, which are built upon base language models. These
+  * models demonstrate significantly improved performance in comparison with open-source models,
+  * and slightly fall behind the proprietary models. ''
+  *
+  * '''Note:'''
+  *
+  * This is a very computationally expensive module especially on larger sequence. The use of an
+  * accelerator such as GPU is recommended.
+  *
+  * ==Example==
+  * {{{
+  * import spark.implicits._
+  * import com.johnsnowlabs.nlp.base.DocumentAssembler
+  * import com.johnsnowlabs.nlp.annotators.seq2seq.QwenTransformer
+  * import org.apache.spark.ml.Pipeline
+  *
+  * val documentAssembler = new DocumentAssembler()
+  *   .setInputCol("text")
+  *   .setOutputCol("documents")
+  *
+  * val Qwen = QwenTransformer.pretrained("Qwen-7b")
+  *   .setInputCols(Array("documents"))
+  *   .setMinOutputLength(10)
+  *   .setMaxOutputLength(50)
+  *   .setDoSample(false)
+  *   .setTopK(50)
+  *   .setNoRepeatNgramSize(3)
+  *   .setOutputCol("generation")
+  *
+  * val pipeline = new Pipeline().setStages(Array(documentAssembler, Qwen))
+  *
+  * val data = Seq(
+  *   "My name is Leonardo."
+  * ).toDF("text")
+  * val result = pipeline.fit(data).transform(data)
+  *
+  * result.select("generation.result").show(truncate = false)
+  * +----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+  * |result                                                                                                                                                                                              |
+  * +----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+  * |[ My name is Leonardo . I am a student of the University of California, Berkeley. I am interested in the field of Artificial Intelligence and its applications in the real world. I have a strong   |
+  * | passion for learning and am always looking for ways to improve my knowledge and skills]                                                                                                            |
+  * +----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+  * }}}
+  *
+  * @param uid
+  *   required uid for storing annotator to disk
+  * @groupname anno Annotator types
+  * @groupdesc anno
+  *   Required input and expected output annotator types
+  * @groupname Ungrouped Members
+  * @groupname param Parameters
+  * @groupname setParam Parameter setters
+  * @groupname getParam Parameter getters
+  * @groupname Ungrouped Members
+  * @groupprio param  1
+  * @groupprio anno  2
+  * @groupprio Ungrouped 3
+  * @groupprio setParam  4
+  * @groupprio getParam  5
+  * @groupdesc param
+  *   A list of (hyper-)parameter keys this annotator can take. Users can set and get the
+  *   parameter values through setters and getters, respectively.
+  */
+class QwenTransformer(override val uid: String)
+    extends AnnotatorModel[QwenTransformer]
+    with HasBatchedAnnotate[QwenTransformer]
+    with ParamsAndFeaturesWritable
+    with WriteOnnxModel
+    with HasGeneratorProperties
+    with HasEngine {
+
+  def this() = this(Identifiable.randomUID("QwenTRANSFORMER"))
+
+  /** Input annotator type : DOCUMENT
+    *
+    * @group param
+    */
+  override val inputAnnotatorTypes: Array[AnnotatorType] = Array(DOCUMENT)
+
+  /** Output annotator type : DOCUMENT
+    *
+    * @group param
+    */
+  override val outputAnnotatorType: String = DOCUMENT
+
+  /** @group setParam */
+  def setRandomSeed(value: Int): QwenTransformer.this.type = {
+    if (randomSeed.isEmpty) {
+      this.randomSeed = Some(value)
+    }
+    this
+  }
+
+  /** A list of token ids which are ignored in the decoder's output (Default: `Array()`)
+    *
+    * @group param
+    */
+  var ignoreTokenIds = new IntArrayParam(
+    this,
+    "ignoreTokenIds",
+    "A list of token ids which are ignored in the decoder's output")
+
+  /** @group setParam */
+  def setIgnoreTokenIds(tokenIds: Array[Int]): QwenTransformer.this.type = {
+    set(ignoreTokenIds, tokenIds)
+  }
+
+  /** @group getParam */
+  def getIgnoreTokenIds: Array[Int] = $(ignoreTokenIds)
+
+  /** Vocabulary used to encode the words to ids with bpeTokenizer.encode
+    *
+    * @group param
+    */
+  val vocabulary: MapFeature[String, Int] = new MapFeature(this, "vocabulary").setProtected()
+
+  /** @group setParam */
+  def setVocabulary(value: Map[String, Int]): this.type = set(vocabulary, value)
+
+  /** Holding the BPE merges loaded from merges.txt
+    *
+    * @group param
+    */
+  val merges: MapFeature[(String, String), Int] = new MapFeature(this, "merges").setProtected()
+
+  /** @group setParam */
+  def setMerges(value: Map[(String, String), Int]): this.type = set(merges, value)
+
+  private var _model: Option[Broadcast[Qwen]] = None
+
+  val generationConfig: StructFeature[GenerationConfig] =
+    new StructFeature(this, "generationConfig").setProtected()
+
+  def setGenerationConfig(value: GenerationConfig): this.type =
+    set(generationConfig, value)
+
+  def getGenerationConfig: GenerationConfig = $$(generationConfig)
+
+  /** @group setParam */
+  def setModelIfNotSet(spark: SparkSession, onnxWrappers: DecoderWrappers): this.type = {
+    if (_model.isEmpty) {
+      _model = Some(
+        spark.sparkContext.broadcast(
+          new Qwen(
+            onnxWrappers,
+            $$(merges),
+            $$(vocabulary),
+            generationConfig = getGenerationConfig)))
+    }
+    this
+  }
+
+  /** @group getParam */
+  def getModelIfNotSet: Qwen = _model.get.value
+
+  setDefault(
+    minOutputLength -> 0,
+    maxOutputLength -> 20,
+    doSample -> false,
+    temperature -> 0.6,
+    topK -> 50,
+    topP -> 0.9,
+    repetitionPenalty -> 1.0,
+    noRepeatNgramSize -> 3,
+    ignoreTokenIds -> Array(),
+    batchSize -> 1,
+    beamSize -> 1,
+    maxInputLength -> 4096)
+
+  /** Generates text for every non-empty annotation of every row in the batch.
+    *
+    * @param batchedAnnotations
+    *   Annotations that correspond to inputAnnotationCols, one array per row
+    * @return
+    *   One sequence of generated annotations per input row; empty for rows without text
+    */
+  override def batchAnnotate(batchedAnnotations: Seq[Array[Annotation]]): Seq[Seq[Annotation]] = {
+    // remember the row every non-empty annotation came from (index taken before filtering)
+    val allAnnotations = batchedAnnotations.zipWithIndex.flatMap { case (annotations, rowIndex) =>
+      annotations.filter(_.result.nonEmpty).map(annotation => (rowIndex, annotation))
+    }
+    val processedAnnotations: Seq[(Int, Annotation)] = if (allAnnotations.nonEmpty) {
+      val generated = this.getModelIfNotSet.predict(
+        sentences = allAnnotations.map(_._2),
+        batchSize = $(batchSize),
+        minOutputLength = $(minOutputLength),
+        maxOutputLength = $(maxOutputLength),
+        doSample = $(doSample),
+        temperature = $(temperature),
+        topK = $(topK),
+        topP = $(topP),
+        repetitionPenalty = $(repetitionPenalty),
+        noRepeatNgramSize = $(noRepeatNgramSize),
+        randomSeed = this.randomSeed,
+        ignoreTokenIds = $(ignoreTokenIds),
+        beamSize = $(beamSize),
+        maxInputLength = $(maxInputLength))
+      // predict zips its results with the inputs, so results line up with the row indices
+      allAnnotations.map(_._1).zip(generated)
+    } else {
+      Seq()
+    }
+    // hand back exactly one result sequence per input row so rows stay aligned
+    batchedAnnotations.indices.map { rowIndex =>
+      processedAnnotations.filter(_._1 == rowIndex).map(_._2)
+    }
+  }
+
+  /** Writes the ONNX decoder model; any other engine fails with `notSupportedEngineError`. */
+  override def onWrite(path: String, spark: SparkSession): Unit = {
+    super.onWrite(path, spark)
+    getEngine match {
+      case ONNX.name =>
+        val decoder = getModelIfNotSet.onnxWrappers.decoder
+        val models = Seq((decoder, "decoder_model.onnx"))
+        writeOnnxModels(path, spark, models, QwenTransformer.suffix)
+      case _ =>
+        throw new Exception(notSupportedEngineError)
+    }
+  }
+}
+
+trait ReadablePretrainedQwenTransformerModel
+    extends ParamsAndFeaturesReadable[QwenTransformer]
+    with HasPretrained[QwenTransformer] {
+  override val defaultModelName: Some[String] = Some("Qwen-7b")
+
+  /** Java compliant-overrides */
+  override def pretrained(): QwenTransformer = super.pretrained()
+
+  override def pretrained(name: String): QwenTransformer = super.pretrained(name)
+
+  override def pretrained(name: String, lang: String): QwenTransformer =
+    super.pretrained(name, lang)
+
+  override def pretrained(name: String, lang: String, remoteLoc: String): QwenTransformer =
+    super.pretrained(name, lang, remoteLoc)
+}
+
+trait ReadQwenTransformerDLModel extends ReadOnnxModel {
+  this: ParamsAndFeaturesReadable[QwenTransformer] =>
+
+  override val onnxFile: String = "qwen_onnx"
+  val suffix: String = "_qwen"
+
+  def readModel(instance: QwenTransformer, path: String, spark: SparkSession): Unit = {
+    instance.getEngine match {
+      case ONNX.name =>
+        val wrappers =
+          readOnnxModels(path, spark, Seq("decoder_model.onnx"), suffix)
+        val onnxWrappers =
+          DecoderWrappers(decoder = wrappers("decoder_model.onnx"))
+        instance.setModelIfNotSet(spark, onnxWrappers)
+      case _ =>
+        throw new Exception(notSupportedEngineError)
+    }
+  }
+
+  addReader(readModel)
+
+  def loadSavedModel(modelPath: String, spark: SparkSession): QwenTransformer = {
+    implicit val formats: DefaultFormats.type = DefaultFormats // for json4
+    val (localModelPath, detectedEngine) =
+      modelSanityCheck(modelPath, isDecoder = true)
+    val modelConfig: JValue =
+      parse(loadJsonStringAsset(localModelPath, "config.json"))
+
+    val beginSuppressTokens: Array[Int] =
+      (modelConfig \ "begin_suppress_tokens").extract[Array[Int]]
+
+    val suppressTokenIds: Array[Int] =
+      (modelConfig \ "suppress_tokens").extract[Array[Int]]
+
+    val forcedDecoderIds: Array[(Int, Int)] =
+      (modelConfig \ "forced_decoder_ids").extract[Array[Array[Int]]].map {
+        case idxWithTokenId: Array[Int] if idxWithTokenId.length == 2 =>
+          (idxWithTokenId(0), idxWithTokenId(1))
+        case _ =>
+          throw new Exception(
+            "Could not extract forced_decoder_ids. Should be a list of tuples with 2 entries.")
+      }
+
+    def arrayOrNone[T](array: Array[T]): Option[Array[T]] =
+      if (array.nonEmpty) Some(array) else None
+
+    val bosTokenId = (modelConfig \ "bos_token_id").extract[Int]
+    val eosTokenId = (modelConfig \ "eos_token_id").extract[Int]
+    val padTokenId = (modelConfig \ "eos_token_id").extract[Int]
+    val vocabSize = (modelConfig \ "vocab_size").extract[Int]
+
+    val vocabs = loadTextAsset(localModelPath, "vocab.txt").zipWithIndex.toMap
+
+    val bytePairs = loadTextAsset(localModelPath, "merges.txt")
+      .map(_.split(" "))
+      .filter(w => w.length == 2)
+      .map { case Array(c1, c2) => (c1, c2) }
+      .zipWithIndex
+      .toMap
+
+    val annotatorModel = new QwenTransformer()
+      .setGenerationConfig(
+        GenerationConfig(
+          bosTokenId,
+          padTokenId,
+          eosTokenId,
+          vocabSize,
+          arrayOrNone(beginSuppressTokens),
+          arrayOrNone(suppressTokenIds),
+          arrayOrNone(forcedDecoderIds)))
+      .setVocabulary(vocabs)
+      .setMerges(bytePairs)
+
+    annotatorModel.set(annotatorModel.engine, detectedEngine)
+
+    detectedEngine match {
+      case ONNX.name =>
+        val onnxWrapperDecoder =
+          OnnxWrapper.read(
+            localModelPath,
+            zipped = false,
+            useBundle = true,
+            modelName = "decoder_model")
+
+        val onnxWrappers = DecoderWrappers(onnxWrapperDecoder)
+
+        annotatorModel
+          .setModelIfNotSet(spark, onnxWrappers)
+
+      case _ =>
+        throw new Exception(notSupportedEngineError)
+    }
+
+    annotatorModel
+  }
+
+}
+
+object QwenTransformer
+    extends ReadablePretrainedQwenTransformerModel
+    with ReadQwenTransformerDLModel
diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/tokenizer/bpe/BpeSpecialTokens.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/tokenizer/bpe/BpeSpecialTokens.scala
index e7a15439eb47e8..5bab008aac220f 100644
--- a/src/main/scala/com/johnsnowlabs/nlp/annotators/tokenizer/bpe/BpeSpecialTokens.scala
+++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/tokenizer/bpe/BpeSpecialTokens.scala
@@ -153,6 +153,15 @@ private[johnsnowlabs] object SpecialTokens {
           unkTokenString = "<|endoftext|>",
           maskTokenString = "<|endoftext|>",
           padTokenString = "<|endoftext|>")
+      case "qwen" =>
+        SpecialTokens(
+          vocab,
+          startTokenString = "<|im_start|>",
+          endTokenString = "<|im_end|>",
+          unkTokenString = "<|endoftext|>",
+          maskTokenString = "<|endoftext|>",
+          padTokenString = "<|endoftext|>")
+
     }
 }
 
diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/tokenizer/bpe/BpeTokenizer.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/tokenizer/bpe/BpeTokenizer.scala
index a75457758dc813..d1538ca4d2ac97 100644
--- a/src/main/scala/com/johnsnowlabs/nlp/annotators/tokenizer/bpe/BpeTokenizer.scala
+++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/tokenizer/bpe/BpeTokenizer.scala
@@ -361,6 +361,13 @@ object BpeTokenizer {
           modelSpecialTokens(),
           padWithSequenceTokens,
           addPrefixSpaceToSentence = addPrefixSpaceToSentence)
+      case "qwen" =>
+        new QwenTokenizer(
+          merges,
+          vocab,
+          modelSpecialTokens(),
+          padWithSequenceTokens,
+          addPrefixSpaceToSentence = addPrefixSpaceToSentence)
       case _ =>
         throw new IllegalArgumentException("Model type \"" + modelType + "\" not supported yet.")
     }
diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/tokenizer/bpe/QwenTokenizer.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/tokenizer/bpe/QwenTokenizer.scala
new file mode 100644
index 00000000000000..b790a28494b790
--- /dev/null
+++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/tokenizer/bpe/QwenTokenizer.scala
@@ -0,0 +1,31 @@
+/*
+ * Copyright 2017-2023 John Snow Labs
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.johnsnowlabs.nlp.annotators.tokenizer.bpe
+
+class QwenTokenizer(
+    merges: Map[(String, String), Int],
+    vocab: Map[String, Int],
+    specialTokens: SpecialTokens,
+    padWithSequenceTokens: Boolean = false,
+    addPrefixSpaceToSentence: Boolean = false)
+    extends Gpt2Tokenizer(
+      merges,
+      vocab,
+      specialTokens,
+      padWithSequenceTokens,
+      prependString = "Ġ",
+      addPrefixSpaceToSentence)
diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/seq2seq/QwenTestSpec.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/seq2seq/QwenTestSpec.scala
new file mode 100644
index 00000000000000..8a0df4439a1f8c
--- /dev/null
+++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/seq2seq/QwenTestSpec.scala
@@ -0,0 +1,52 @@
+/*
+ * Copyright 2017-2023 John Snow Labs
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.johnsnowlabs.nlp.annotators.seq2seq
+
+import com.johnsnowlabs.nlp.base.DocumentAssembler
+import com.johnsnowlabs.nlp.util.io.ResourceHelper
+import com.johnsnowlabs.tags.{FastTest, SlowTest}
+import org.apache.spark.ml.Pipeline
+import org.scalatest.flatspec.AnyFlatSpec
+
+class QwenTestSpec extends AnyFlatSpec {
+
+  "phi2" should "should handle temperature=0 correctly and not crash when predicting more than 1 element with doSample=True" taggedAs SlowTest in {
+    // Even tough the Paper states temperature in interval [0,1), using temperature=0 will result in division by 0 error.
+    // Also DoSample=True may result in infinities being generated and distFiltered.length==0 which results in exception if we don't return 0 instead internally.
+    val testData = ResourceHelper.spark
+      .createDataFrame(Seq((1, "My name is Leonardo.")))
+      .toDF("id", "text")
+      .repartition(1)
+    val documentAssembler = new DocumentAssembler()
+      .setInputCol("text")
+      .setOutputCol("documents")
+
+    val bart = QwenTransformer
+      .pretrained()
+      .setInputCols(Array("documents"))
+      .setDoSample(false)
+      .setMaxOutputLength(50)
+      .setOutputCol("generation")
+      .setBeamSize(1)
+    new Pipeline()
+      .setStages(Array(documentAssembler, bart))
+      .fit(testData)
+      .transform(testData)
+      .show(truncate = false)
+
+  }
+}

From 5e68c41a3b43ef259ba0f3e7c840aa7768194635 Mon Sep 17 00:00:00 2001
From: Prabod Rathnayaka <prabod@rathnayaka.me>
Date: Thu, 29 Feb 2024 12:50:18 +0000
Subject: [PATCH 2/4] QwenTransformer python api and tests

---
 python/sparknlp/annotator/seq2seq/__init__.py |   1 +
 .../annotator/seq2seq/qwen_transformer.py     | 339 ++++++++++++++++++
 python/sparknlp/internal/__init__.py          |   5 +
 .../seq2seq/qwen_transformer_test.py          |  47 +++
 .../nlp/annotators/seq2seq/QwenTestSpec.scala |   9 +-
 5 files changed, 397 insertions(+), 4 deletions(-)
 create mode 100644 python/sparknlp/annotator/seq2seq/qwen_transformer.py
 create mode 100644 python/test/annotator/seq2seq/qwen_transformer_test.py

diff --git a/python/sparknlp/annotator/seq2seq/__init__.py b/python/sparknlp/annotator/seq2seq/__init__.py
index 76e34a8c774969..ddf3c4dd083fef 100644
--- a/python/sparknlp/annotator/seq2seq/__init__.py
+++ b/python/sparknlp/annotator/seq2seq/__init__.py
@@ -21,3 +21,4 @@
 from sparknlp.annotator.seq2seq.m2m100_transformer import *
 from sparknlp.annotator.seq2seq.phi2_transformer import *
 from sparknlp.annotator.seq2seq.mistral_transformer import *
+from sparknlp.annotator.seq2seq.qwen_transformer import *
diff --git a/python/sparknlp/annotator/seq2seq/qwen_transformer.py b/python/sparknlp/annotator/seq2seq/qwen_transformer.py
new file mode 100644
index 00000000000000..e0ae31f6dfa9f4
--- /dev/null
+++ b/python/sparknlp/annotator/seq2seq/qwen_transformer.py
@@ -0,0 +1,339 @@
+#  Copyright 2017-2022 John Snow Labs
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+"""Contains classes for the QwenTransformer."""
+
+from sparknlp.common import *
+
+
+class QwenTransformer(AnnotatorModel, HasBatchedAnnotate, HasEngine):
+    """Qwen: comprehensive language model series
+
+    Qwen1.5 is the beta version of Qwen2, a transformer-based decoder-only language model
+    pretrained on a large amount of data. In comparison with the previously released Qwen, the
+    improvements include:
+
+    6 model sizes, including 0.5B, 1.8B, 4B, 7B, 14B, and 72B; Significant performance improvement
+    in Chat models; Multilingual support of both base and chat models; Stable support of 32K
+    context length for models of all sizes.
+
+    Qwen1.5 is a language model series including decoder language models of different model sizes.
+    For each size, we release the base language model and the aligned chat model. It is based on
+    the Transformer architecture with SwiGLU activation, attention QKV bias, group query
+    attention, mixture of sliding window attention and full attention, etc. Additionally, we have
+    an improved tokenizer adaptive to multiple natural languages and codes. For the beta version,
+    temporarily we did not include GQA and the mixture of SWA and full attention.
+
+    Pretrained models can be loaded with :meth:`.pretrained` of the companion
+    object:
+
+    >>> qwen = QwenTransformer.pretrained() \\
+    ...     .setInputCols(["document"]) \\
+    ...     .setOutputCol("generation")
+
+
+    The default model is ``"qwen-7b"``, if no name is provided. For available
+    pretrained models please see the `Models Hub
+    <https://sparknlp.org/models?q=qwen>`__.
+
+    ====================== ======================
+    Input Annotation types Output Annotation type
+    ====================== ======================
+    ``DOCUMENT``           ``DOCUMENT``
+    ====================== ======================
+
+    Parameters
+    ----------
+    configProtoBytes
+        ConfigProto from tensorflow, serialized into byte array.
+    minOutputLength
+        Minimum length of the sequence to be generated, by default 0
+    maxOutputLength
+        Maximum length of output text, by default 50
+    doSample
+        Whether or not to use sampling; use greedy decoding otherwise, by default False
+    temperature
+        The value used to modulate the next token probabilities, by default 0.6
+    topK
+        The number of highest probability vocabulary tokens to keep for
+        top-k-filtering, by default 50
+    topP
+        Top cumulative probability for vocabulary tokens, by default 0.9
+
+        If set to float < 1, only the most probable tokens with probabilities
+        that add up to ``topP`` or higher are kept for generation.
+    repetitionPenalty
+        The parameter for repetition penalty, 1.0 means no penalty, by default
+        1.0
+    noRepeatNgramSize
+        If set to int > 0, all ngrams of that size can only occur once, by
+        default 0
+    ignoreTokenIds
+        A list of token ids which are ignored in the decoder's output, by
+        default []
+
+    Notes
+    -----
+    This is a very computationally expensive module especially on larger
+    sequence. The use of an accelerator such as GPU is recommended.
+
+    References
+    ----------
+    - `Qwen Technical Report
+      <https://arxiv.org/pdf/2309.16609.pdf>`__
+    - https://qwenlm.github.io/blog/qwen1.5/
+    - https://github.com/QwenLM/Qwen1.5
+
+    **Paper Abstract:**
+
+    *Large language models (LLMs) have revolutionized the field of artificial intelligence,
+    enabling natural language processing tasks that were previously thought to be exclusive to
+    humans. In this work, we introduce Qwen, the first installment of our large language model
+    series. Qwen is a comprehensive language model series that encompasses distinct models with
+    varying parameter counts. It includes Qwen, the base pretrained language models, and
+    Qwen-Chat, the chat models finetuned with human alignment techniques. The base language models
+    consistently demonstrate superior performance across a multitude of downstream tasks, and the
+    chat models, particularly those trained using Reinforcement Learning from Human Feedback
+    (RLHF), are highly competitive. The chat models possess advanced tool-use and planning
+    capabilities for creating agent applications, showcasing impressive performance even when
+    compared to bigger models on complex tasks like utilizing a code interpreter. Furthermore, we
+    have developed coding-specialized models, Code-Qwen and Code-Qwen-Chat, as well as
+    mathematics-focused models, Math-Qwen-Chat, which are built upon base language models. These
+    models demonstrate significantly improved performance in comparison with open-source models,
+    and slightly fall behind the proprietary models.*
+
+    Examples
+    --------
+    >>> import sparknlp
+    >>> from sparknlp.base import *
+    >>> from sparknlp.annotator import *
+    >>> from pyspark.ml import Pipeline
+    >>> documentAssembler = DocumentAssembler() \\
+    ...     .setInputCol("text") \\
+    ...     .setOutputCol("documents")
+    >>> qwen = QwenTransformer.pretrained("qwen-7b") \\
+    ...     .setInputCols(["documents"]) \\
+    ...     .setMaxOutputLength(50) \\
+    ...     .setOutputCol("generation")
+    >>> pipeline = Pipeline().setStages([documentAssembler, qwen])
+    >>> data = spark.createDataFrame([["My name is Leonardo."]]).toDF("text")
+    >>> result = pipeline.fit(data).transform(data)
+    >>> result.select("generation.result").show(truncate=False)
+    +----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+    |result                                                                                                                                                                                              |
+    +----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+    |[My name is Leonardo . I am a student of the University of California, Berkeley. I am interested in the field of Artificial Intelligence and its applications in the real world. I have a strong    |
+    | passion for learning and am always looking for ways to improve my knowledge and skills]                                                                                                            |
+    -----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+    """
+
+    name = "QwenTransformer"
+
+    inputAnnotatorTypes = [AnnotatorType.DOCUMENT]
+
+    outputAnnotatorType = AnnotatorType.DOCUMENT
+
+    configProtoBytes = Param(Params._dummy(), "configProtoBytes",
+                             "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()",
+                             TypeConverters.toListInt)
+
+    minOutputLength = Param(Params._dummy(), "minOutputLength", "Minimum length of the sequence to be generated",
+                            typeConverter=TypeConverters.toInt)
+
+    maxOutputLength = Param(Params._dummy(), "maxOutputLength", "Maximum length of output text",
+                            typeConverter=TypeConverters.toInt)
+
+    doSample = Param(Params._dummy(), "doSample", "Whether or not to use sampling; use greedy decoding otherwise",
+                     typeConverter=TypeConverters.toBoolean)
+
+    temperature = Param(Params._dummy(), "temperature", "The value used to module the next token probabilities",
+                        typeConverter=TypeConverters.toFloat)
+
+    topK = Param(Params._dummy(), "topK",
+                 "The number of highest probability vocabulary tokens to keep for top-k-filtering",
+                 typeConverter=TypeConverters.toInt)
+
+    topP = Param(Params._dummy(), "topP",
+                 "If set to float < 1, only the most probable tokens with probabilities that add up to ``top_p`` or higher are kept for generation",
+                 typeConverter=TypeConverters.toFloat)
+
+    repetitionPenalty = Param(Params._dummy(), "repetitionPenalty",
+                              "The parameter for repetition penalty. 1.0 means no penalty. See `this paper <https://arxiv.org/pdf/1909.05858.pdf>`__ for more details",
+                              typeConverter=TypeConverters.toFloat)
+
+    noRepeatNgramSize = Param(Params._dummy(), "noRepeatNgramSize",
+                              "If set to int > 0, all ngrams of that size can only occur once",
+                              typeConverter=TypeConverters.toInt)
+
+    ignoreTokenIds = Param(Params._dummy(), "ignoreTokenIds",
+                           "A list of token ids which are ignored in the decoder's output",
+                           typeConverter=TypeConverters.toListInt)
+
+    def setIgnoreTokenIds(self, value):
+        """A list of token ids which are ignored in the decoder's output.
+
+        Parameters
+        ----------
+        value : List[int]
+            The token ids to be ignored
+        """
+        return self._set(ignoreTokenIds=value)
+
+    def setConfigProtoBytes(self, b):
+        """Sets configProto from tensorflow, serialized into byte array.
+
+        Parameters
+        ----------
+        b : List[int]
+            ConfigProto from tensorflow, serialized into byte array
+        """
+        return self._set(configProtoBytes=b)
+
+    def setMinOutputLength(self, value):
+        """Sets minimum length of the sequence to be generated.
+
+        Parameters
+        ----------
+        value : int
+            Minimum length of the sequence to be generated
+        """
+        return self._set(minOutputLength=value)
+
+    def setMaxOutputLength(self, value):
+        """Sets maximum length of output text.
+
+        Parameters
+        ----------
+        value : int
+            Maximum length of output text
+        """
+        return self._set(maxOutputLength=value)
+
+    def setDoSample(self, value):
+        """Sets whether or not to use sampling, use greedy decoding otherwise.
+
+        Parameters
+        ----------
+        value : bool
+            Whether or not to use sampling; use greedy decoding otherwise
+        """
+        return self._set(doSample=value)
+
+    def setTemperature(self, value):
+        """Sets the value used to modulate the next token probabilities.
+
+        Parameters
+        ----------
+        value : float
+            The value used to modulate the next token probabilities
+        """
+        return self._set(temperature=value)
+
+    def setTopK(self, value):
+        """Sets the number of highest probability vocabulary tokens to keep for
+        top-k-filtering.
+
+        Parameters
+        ----------
+        value : int
+            Number of highest probability vocabulary tokens to keep
+        """
+        return self._set(topK=value)
+
+    def setTopP(self, value):
+        """Sets the top cumulative probability for vocabulary tokens.
+
+        If set to float < 1, only the most probable tokens with probabilities
+        that add up to ``topP`` or higher are kept for generation.
+
+        Parameters
+        ----------
+        value : float
+            Cumulative probability for vocabulary tokens
+        """
+        return self._set(topP=value)
+
+    def setRepetitionPenalty(self, value):
+        """Sets the parameter for repetition penalty. 1.0 means no penalty.
+
+        Parameters
+        ----------
+        value : float
+            The repetition penalty
+
+        References
+        ----------
+        See `Ctrl: A Conditional Transformer Language Model For Controllable
+        Generation <https://arxiv.org/pdf/1909.05858.pdf>`__ for more details.
+        """
+        return self._set(repetitionPenalty=value)
+
+    def setNoRepeatNgramSize(self, value):
+        """Sets size of n-grams that can only occur once.
+
+        If set to int > 0, all ngrams of that size can only occur once.
+
+        Parameters
+        ----------
+        value : int
+            Size of the n-grams that can only occur once
+        """
+        return self._set(noRepeatNgramSize=value)
+
+    @keyword_only
+    def __init__(self, classname="com.johnsnowlabs.nlp.annotators.seq2seq.QwenTransformer", java_model=None):
+        super(QwenTransformer, self).__init__(classname=classname, java_model=java_model)
+        self._setDefault(minOutputLength=0, maxOutputLength=50, doSample=False, temperature=0.6, topK=50, topP=0.9,
+            repetitionPenalty=1.0, noRepeatNgramSize=0, ignoreTokenIds=[], batchSize=1)
+
+    @staticmethod
+    def loadSavedModel(folder, spark_session):
+        """Loads a locally saved model.
+
+        Parameters
+        ----------
+        folder : str
+            Folder of the saved model
+        spark_session : pyspark.sql.SparkSession
+            The current SparkSession
+
+        Returns
+        -------
+        QwenTransformer
+            The restored model
+        """
+        from sparknlp.internal import _QwenLoader
+        jModel = _QwenLoader(folder, spark_session._jsparkSession)._java_obj
+        return QwenTransformer(java_model=jModel)
+
+    @staticmethod
+    def pretrained(name="qwen-7b", lang="en", remote_loc=None):
+        """Downloads and loads a pretrained model.
+
+        Parameters
+        ----------
+        name : str, optional
+            Name of the pretrained model, by default "qwen-7b"
+        lang : str, optional
+            Language of the pretrained model, by default "en"
+        remote_loc : str, optional
+            Optional remote address of the resource, by default None. Will use
+            Spark NLPs repositories otherwise.
+
+        Returns
+        -------
+        QwenTransformer
+            The restored model
+        """
+        from sparknlp.pretrained import ResourceDownloader
+        return ResourceDownloader.downloadModel(QwenTransformer, name, lang, remote_loc)
diff --git a/python/sparknlp/internal/__init__.py b/python/sparknlp/internal/__init__.py
index deeff9c5189f52..68644b292d0ba8 100644
--- a/python/sparknlp/internal/__init__.py
+++ b/python/sparknlp/internal/__init__.py
@@ -378,6 +378,11 @@ def __init__(self, path, jspark, useCache):
             useCache,
         )
 
+class _QwenLoader(ExtendedJavaWrapper):
+    def __init__(self, path, jspark):
+        super(_QwenLoader, self).__init__(
+            "com.johnsnowlabs.nlp.annotators.seq2seq.QwenTransformer.loadSavedModel", path, jspark)
+
 
 class _USELoader(ExtendedJavaWrapper):
     def __init__(self, path, jspark, loadsp):
diff --git a/python/test/annotator/seq2seq/qwen_transformer_test.py b/python/test/annotator/seq2seq/qwen_transformer_test.py
new file mode 100644
index 00000000000000..05a063857cc900
--- /dev/null
+++ b/python/test/annotator/seq2seq/qwen_transformer_test.py
@@ -0,0 +1,47 @@
+#  Copyright 2017-2022 John Snow Labs
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+import unittest
+
+import pytest
+
+from sparknlp.annotator import *
+from sparknlp.base import *
+from test.util import SparkContextForTest
+
+
+@pytest.mark.fast
+class QwenTransformerTextGenerationTestSpec(unittest.TestCase):
+    def setUp(self):
+        self.spark = SparkContextForTest.spark
+
+    def runTest(self):
+        data = self.spark.createDataFrame([
+            [1, """system\nYou are a helpful assistant.\nuser\nGive me a short introduction to large language model.\nassistant\n""".strip().replace("\n", " ")]]).toDF("id", "text")
+
+        document_assembler = DocumentAssembler() \
+            .setInputCol("text") \
+            .setOutputCol("documents")
+
+        llama2 = QwenTransformer \
+            .pretrained() \
+            .setMaxOutputLength(50) \
+            .setDoSample(False) \
+            .setInputCols(["documents"]) \
+            .setOutputCol("generation")
+
+        pipeline = Pipeline().setStages([document_assembler, llama2])
+        results = pipeline.fit(data).transform(data)
+
+        results.select("generation.result").show(truncate=False)
+
diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/seq2seq/QwenTestSpec.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/seq2seq/QwenTestSpec.scala
index 8a0df4439a1f8c..d043a41ce16372 100644
--- a/src/test/scala/com/johnsnowlabs/nlp/annotators/seq2seq/QwenTestSpec.scala
+++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/seq2seq/QwenTestSpec.scala
@@ -24,18 +24,19 @@ import org.scalatest.flatspec.AnyFlatSpec
 
 class QwenTestSpec extends AnyFlatSpec {
 
-  "phi2" should "should handle temperature=0 correctly and not crash when predicting more than 1 element with doSample=True" taggedAs SlowTest in {
+  "qwen" should "should handle temperature=0 correctly and not crash when predicting more than 1 element with doSample=True" taggedAs SlowTest in {
     // Even tough the Paper states temperature in interval [0,1), using temperature=0 will result in division by 0 error.
     // Also DoSample=True may result in infinities being generated and distFiltered.length==0 which results in exception if we don't return 0 instead internally.
     val testData = ResourceHelper.spark
-      .createDataFrame(Seq((1, "My name is Leonardo.")))
+      .createDataFrame(Seq(
+        (1, "system\\nYou are a helpful assistant.\\nuser\\nGive me a short introduction to large language model.\\nassistant\\n")))
       .toDF("id", "text")
       .repartition(1)
     val documentAssembler = new DocumentAssembler()
       .setInputCol("text")
       .setOutputCol("documents")
 
-    val bart = QwenTransformer
+    val qwen = QwenTransformer
       .pretrained()
       .setInputCols(Array("documents"))
       .setDoSample(false)
@@ -43,7 +44,7 @@ class QwenTestSpec extends AnyFlatSpec {
       .setOutputCol("generation")
       .setBeamSize(1)
     new Pipeline()
-      .setStages(Array(documentAssembler, bart))
+      .setStages(Array(documentAssembler, qwen))
       .fit(testData)
       .transform(testData)
       .show(truncate = false)

From 4e6b443766ea1a8cc05b60551c7f71230548f1f5 Mon Sep 17 00:00:00 2001
From: Prabod Rathnayaka <prabod@rathnayaka.me>
Date: Thu, 29 Feb 2024 13:14:32 +0000
Subject: [PATCH 3/4] QwenTransformer python api and tests

---
 python/test/annotator/seq2seq/qwen_transformer_test.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/test/annotator/seq2seq/qwen_transformer_test.py b/python/test/annotator/seq2seq/qwen_transformer_test.py
index 05a063857cc900..72c9bc675cf99d 100644
--- a/python/test/annotator/seq2seq/qwen_transformer_test.py
+++ b/python/test/annotator/seq2seq/qwen_transformer_test.py
@@ -20,7 +20,7 @@
 from test.util import SparkContextForTest
 
 
-@pytest.mark.fast
+@pytest.mark.slow
 class QwenTransformerTextGenerationTestSpec(unittest.TestCase):
     def setUp(self):
         self.spark = SparkContextForTest.spark

From 4c83df7da8630388315d7bc662a02fcc8839c009 Mon Sep 17 00:00:00 2001
From: Prabod Rathnayaka <prabod@rathnayaka.me>
Date: Wed, 17 Jul 2024 09:22:58 +0000
Subject: [PATCH 4/4] Added Openvino support

---
 .../annotator/seq2seq/qwen_transformer.py     |   4 +-
 python/sparknlp/internal/__init__.py          |   4 +-
 .../scala/com/johnsnowlabs/ml/ai/Qwen.scala   | 145 +++++++++++++++---
 .../annotators/seq2seq/QwenTransformer.scala  |  61 ++++++--
 4 files changed, 175 insertions(+), 39 deletions(-)

diff --git a/python/sparknlp/annotator/seq2seq/qwen_transformer.py b/python/sparknlp/annotator/seq2seq/qwen_transformer.py
index e0ae31f6dfa9f4..27ece0e914dde1 100644
--- a/python/sparknlp/annotator/seq2seq/qwen_transformer.py
+++ b/python/sparknlp/annotator/seq2seq/qwen_transformer.py
@@ -297,7 +297,7 @@ def __init__(self, classname="com.johnsnowlabs.nlp.annotators.seq2seq.QwenTransf
             repetitionPenalty=1.0, noRepeatNgramSize=0, ignoreTokenIds=[], batchSize=1)
 
     @staticmethod
-    def loadSavedModel(folder, spark_session):
+    def loadSavedModel(folder, spark_session, use_openvino=False):
         """Loads a locally saved model.
 
         Parameters
@@ -313,7 +313,7 @@ def loadSavedModel(folder, spark_session):
             The restored model
         """
         from sparknlp.internal import _QwenLoader
-        jModel = _QwenLoader(folder, spark_session._jsparkSession)._java_obj
+        jModel = _QwenLoader(folder, spark_session._jsparkSession, use_openvino)._java_obj
         return QwenTransformer(java_model=jModel)
 
     @staticmethod
diff --git a/python/sparknlp/internal/__init__.py b/python/sparknlp/internal/__init__.py
index 68644b292d0ba8..49e9756f3eed04 100644
--- a/python/sparknlp/internal/__init__.py
+++ b/python/sparknlp/internal/__init__.py
@@ -379,9 +379,9 @@ def __init__(self, path, jspark, useCache):
         )
 
 class _QwenLoader(ExtendedJavaWrapper):
-    def __init__(self, path, jspark):
+    def __init__(self, path, jspark, use_openvino=False):
         super(_QwenLoader, self).__init__(
-            "com.johnsnowlabs.nlp.annotators.seq2seq.QwenTransformer.loadSavedModel", path, jspark)
+            "com.johnsnowlabs.nlp.annotators.seq2seq.QwenTransformer.loadSavedModel", path, jspark, use_openvino)
 
 
 class _USELoader(ExtendedJavaWrapper):
diff --git a/src/main/scala/com/johnsnowlabs/ml/ai/Qwen.scala b/src/main/scala/com/johnsnowlabs/ml/ai/Qwen.scala
index 750f911225e90c..ca2c597f2987f3 100644
--- a/src/main/scala/com/johnsnowlabs/ml/ai/Qwen.scala
+++ b/src/main/scala/com/johnsnowlabs/ml/ai/Qwen.scala
@@ -21,17 +21,21 @@ import com.johnsnowlabs.ml.ai.util.Generation.{Generate, GenerationConfig}
 import com.johnsnowlabs.ml.onnx.OnnxSession
 import com.johnsnowlabs.ml.onnx.OnnxWrapper.DecoderWrappers
 import com.johnsnowlabs.ml.onnx.TensorResources.implicits._
+import com.johnsnowlabs.ml.openvino.OpenvinoWrapper
 import com.johnsnowlabs.ml.tensorflow.sentencepiece.SentencePieceWrapper
+import com.johnsnowlabs.ml.util.{ONNX, Openvino, TensorFlow}
 import com.johnsnowlabs.nlp.Annotation
 import com.johnsnowlabs.nlp.AnnotatorType.DOCUMENT
 import com.johnsnowlabs.nlp.annotators.common.SentenceSplit
 import com.johnsnowlabs.nlp.annotators.tokenizer.bpe.{BpeTokenizer, QwenTokenizer}
+import org.intel.openvino.InferRequest
 import org.tensorflow.{Session, Tensor}
 
 import scala.collection.JavaConverters._
 
 private[johnsnowlabs] class Qwen(
-    val onnxWrappers: DecoderWrappers,
+    val onnxWrappers: Option[DecoderWrappers],
+    val openvinoWrapper: Option[OpenvinoWrapper],
     merges: Map[(String, String), Int],
     vocabulary: Map[String, Int],
     generationConfig: GenerationConfig)
@@ -39,6 +43,11 @@ private[johnsnowlabs] class Qwen(
     with Generate {
 
   private val onnxSessionOptions: Map[String, String] = new OnnxSession().getSessionOptions
+  val detectedEngine: String =
+    if (onnxWrappers.isDefined) ONNX.name
+    else if (openvinoWrapper.isDefined) Openvino.name
+    else ONNX.name
+  private var nextPositionId: Option[Array[Long]] = None
   val bpeTokenizer: QwenTokenizer = BpeTokenizer
     .forModel("qwen", merges = merges, vocab = vocabulary, padWithSequenceTokens = false)
     .asInstanceOf[QwenTokenizer]
@@ -93,8 +102,8 @@ private[johnsnowlabs] class Qwen(
       randomSeed: Option[Long],
       ignoreTokenIds: Array[Int] = Array(),
       beamSize: Int,
-      maxInputLength: Int): Array[Array[Int]] = {
-    val (encoderSession, env) = onnxWrappers.decoder.getSession(onnxSessionOptions)
+      maxInputLength: Int,
+      stopTokenIds: Array[Int]): Array[Array[Int]] = {
     val ignoreTokenIdsInt = ignoreTokenIds
     val expandedDecoderInputsVals = batch
     val sequencesLength = expandedDecoderInputsVals.map(x => x.length).toArray
@@ -121,10 +130,23 @@ private[johnsnowlabs] class Qwen(
 //        (encoderSession, env),
 //        maxOutputLength)
 
-    // dummy tensors for decoder encode state and attention mask
-    val decoderEncoderStateTensors = Right(OnnxTensor.createTensor(env, Array(0)))
-    val encoderAttentionMaskTensors = Right(OnnxTensor.createTensor(env, Array(1)))
-
+    val (decoderEncoderStateTensors, encoderAttentionMaskTensors, session) =
+      detectedEngine match {
+        case ONNX.name =>
+          // dummy tensors for decoder encode state and attention mask
+          val (encoderSession, env) = onnxWrappers.get.decoder.getSession(onnxSessionOptions)
+          (
+            Right(OnnxTensor.createTensor(env, Array(0))),
+            Right(OnnxTensor.createTensor(env, Array(1))),
+            Right((env, encoderSession)))
+        case Openvino.name =>
+          // not needed
+          (null, null, null)
+      }
+    val ovInferRequest: Option[InferRequest] = detectedEngine match {
+      case ONNX.name => None
+      case Openvino.name => Some(openvinoWrapper.get.getCompiledModel().create_infer_request())
+    }
     // output with beam search
     val modelOutputs = generate(
       batch,
@@ -146,8 +168,10 @@ private[johnsnowlabs] class Qwen(
       this.paddingTokenId,
       randomSeed,
       ignoreTokenIdsInt,
-      Right((env, encoderSession)),
-      applySoftmax = false)
+      session,
+      applySoftmax = false,
+      ovInferRequest = ovInferRequest,
+      stopTokenIds = stopTokenIds)
 
 //    decoderOutputs
     modelOutputs
@@ -167,7 +191,8 @@ private[johnsnowlabs] class Qwen(
       randomSeed: Option[Long] = None,
       ignoreTokenIds: Array[Int] = Array(),
       beamSize: Int,
-      maxInputLength: Int): Seq[Annotation] = {
+      maxInputLength: Int,
+      stopTokenIds: Array[Int]): Seq[Annotation] = {
 
     val batchDecoder = sentences.grouped(batchSize).toArray.flatMap { batch =>
       val batchSP = encode(batch)
@@ -184,7 +209,8 @@ private[johnsnowlabs] class Qwen(
         randomSeed,
         ignoreTokenIds,
         beamSize,
-        maxInputLength)
+        maxInputLength,
+        stopTokenIds)
 
       decode(spIds)
 
@@ -239,20 +265,76 @@ private[johnsnowlabs] class Qwen(
       decoderEncoderStateTensors: Either[Tensor, OnnxTensor],
       encoderAttentionMaskTensors: Either[Tensor, OnnxTensor],
       maxLength: Int,
-      session: Either[Session, (OrtEnvironment, OrtSession)]): Array[Array[Float]] = {
+      session: Either[Session, (OrtEnvironment, OrtSession)],
+      ovInferRequest: Option[InferRequest]): Array[Array[Float]] = {
 
-    session.fold(
-      tfSession => {
+    detectedEngine match {
+      case TensorFlow.name =>
         // not implemented yet
         Array()
-      },
-      onnxSession => {
-        val (env, decoderSession) = onnxSession
+      case ONNX.name =>
+        val (env, decoderSession) = session.right.get
         val decoderOutputs =
           getDecoderOutputs(decoderInputIds.toArray, onnxSession = (decoderSession, env))
         decoderOutputs
-      })
+      case Openvino.name =>
+        val decoderOutputs =
+          getDecoderOutputsOv(decoderInputIds.toArray, ovInferRequest.get)
+        decoderOutputs
+    }
+  }
 
+  /** Runs one OpenVINO decoding step and returns the last-position logits of each sequence.
+    *
+    * While `nextPositionId` is unset the full prompt is fed with positions 0 until length;
+    * afterwards only the newest token per row is fed at the positions cached by the previous
+    * call. NOTE(review): nothing here resets `nextPositionId`; confirm a later generation on
+    * the same instance cannot start from stale positions.
+    *
+    * @param inputIds token ids per sequence; rows presumably share one length -- TODO confirm
+    * @param inferRequest OpenVINO request that receives the input tensors and runs inference
+    * @return `vocabSize` logits per sequence, or an empty array for an empty batch
+    */
+  private def getDecoderOutputsOv(
+      inputIds: Array[Array[Int]],
+      inferRequest: InferRequest): (Array[Array[Float]]) = {
+    val batchSize: Int = inputIds.length
+    // An empty batch used to fail on the division by batchSize; answer it directly instead.
+    if (batchSize == 0) Array.empty[Array[Float]]
+    else {
+      val (inputIdsLong, inputPositionIDsLong): (Array[Long], Array[Long]) =
+        nextPositionId match {
+          case Some(cachedPositions) => (inputIds.map(_.last.toLong), cachedPositions)
+          case None =>
+            (inputIds.flatMap(_.map(_.toLong)), inputIds.flatMap(_.indices.map(_.toLong)))
+        }
+      val sequenceLength: Int = inputIdsLong.length / batchSize
+      val stepShape: Array[Int] = Array(batchSize, sequenceLength)
+      // The mask spans the whole sequences even when only the newest token is fed.
+      val attentionMask: Array[Long] = Array.fill(inputIds.map(_.length).sum)(1L)
+      val maskShape: Array[Int] = Array(batchSize, inputIds.head.length)
+      val beamIdx: Array[Int] = new Array[Int](batchSize)
+      inferRequest.set_tensor(
+        OpenVinoSignatures.decoderInputIDs,
+        new org.intel.openvino.Tensor(stepShape, inputIdsLong))
+      inferRequest.set_tensor(
+        OpenVinoSignatures.decoderAttentionMask,
+        new org.intel.openvino.Tensor(maskShape, attentionMask))
+      inferRequest.set_tensor(
+        OpenVinoSignatures.decoderPositionIDs,
+        new org.intel.openvino.Tensor(stepShape, inputPositionIDsLong))
+      inferRequest.set_tensor(
+        OpenVinoSignatures.decoderBeamIdx,
+        new org.intel.openvino.Tensor(Array(batchSize), beamIdx))
+      inferRequest.infer()
+      val logitsRaw = inferRequest.get_tensor(OpenVinoSignatures.decoderOutput).data()
+      nextPositionId = Some(inputIds.map(_.length.toLong))
+      // Keep only the logits of the final fed position of every sequence.
+      (0 until batchSize).map { i =>
+        val rowEnd = (i + 1) * sequenceLength * vocabSize
+        logitsRaw.slice(rowEnd - vocabSize, rowEnd)
+      }.toArray
+    }
   }
   private def getDecoderOutputs(
       inputIds: Array[Array[Int]],
@@ -285,12 +367,12 @@ private[johnsnowlabs] class Qwen(
     val sequenceLength = inputIds.head.length
     val batchSize = inputIds.length
 
-//    val logits = sessionOutput.getFloatArray(OnnxSignatures.decoderOutput)
-//    inputIdsLongTensor.close()
-//    decoderPositionIDs.close()
-//    decoderAttentionMask.close()
-//    val batchLogits = logits.grouped(vocabSize).toArray
-//    batchLogits
+    //    val logits = sessionOutput.getFloatArray(OnnxSignatures.decoderOutput)
+    //    inputIdsLongTensor.close()
+    //    decoderPositionIDs.close()
+    //    decoderAttentionMask.close()
+    //    val batchLogits = logits.grouped(vocabSize).toArray
+    //    batchLogits
 
     val logitsRaw = sessionOutput.getFloatArray(OnnxSignatures.decoderOutput)
     val decoderOutputs = (0 until batchSize).map(i => {
@@ -358,4 +440,19 @@ private[johnsnowlabs] class Qwen(
       (0 until 32).flatMap(i => Seq(s"present.$i.key", s"present.$i.value")).toArray
   }
 
+  private object OpenVinoSignatures { // tensor names passed to the OpenVINO infer request
+    // Encoder-side names (not referenced by getDecoderOutputsOv).
+    val encoderOutput: String = "last_hidden_state"
+    val encoderAttentionMask: String = "attention_mask"
+    val encoderInputIDs: String = "input_ids"
+    // Decoder inputs; the two encoder-related names are not set by getDecoderOutputsOv.
+    val decoderEncoderState: String = "encoder_hidden_states"
+    val decoderEncoderAttentionMask: String = "encoder_attention_mask"
+    val decoderBeamIdx: String = "beam_idx"
+    val decoderPositionIDs: String = "position_ids"
+    val decoderAttentionMask: String = "attention_mask"
+    val decoderInputIDs: String = "input_ids"
+    // Decoder output read back after inference.
+    val decoderOutput: String = "logits"
+  }
 }
diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/QwenTransformer.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/QwenTransformer.scala
index 479d7cbb045d34..9fd834a577cb47 100644
--- a/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/QwenTransformer.scala
+++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/QwenTransformer.scala
@@ -20,6 +20,7 @@ import com.johnsnowlabs.ml.ai.util.Generation.GenerationConfig
 import com.johnsnowlabs.ml.ai.Qwen
 import com.johnsnowlabs.ml.onnx.OnnxWrapper.DecoderWrappers
 import com.johnsnowlabs.ml.onnx.{OnnxWrapper, ReadOnnxModel, WriteOnnxModel}
+import com.johnsnowlabs.ml.openvino.{OpenvinoWrapper, ReadOpenvinoModel, WriteOpenvinoModel}
 import com.johnsnowlabs.ml.util.LoadExternalModel.{
   loadJsonStringAsset,
   loadSentencePieceAsset,
@@ -27,7 +28,7 @@ import com.johnsnowlabs.ml.util.LoadExternalModel.{
   modelSanityCheck,
   notSupportedEngineError
 }
-import com.johnsnowlabs.ml.util.ONNX
+import com.johnsnowlabs.ml.util.{ONNX, Openvino}
 import com.johnsnowlabs.nlp.AnnotatorType.DOCUMENT
 import com.johnsnowlabs.nlp._
 import com.johnsnowlabs.ml.tensorflow.sentencepiece.{
@@ -161,10 +162,11 @@ class QwenTransformer(override val uid: String)
     with HasBatchedAnnotate[QwenTransformer]
     with ParamsAndFeaturesWritable
     with WriteOnnxModel
+    with WriteOpenvinoModel
     with HasGeneratorProperties
     with HasEngine {
 
-  def this() = this(Identifiable.randomUID("QwenTRANSFORMER"))
+  def this() = this(Identifiable.randomUID("QWENTRANSFORMER"))
 
   /** Input annotator type : DOCUMENT
     *
@@ -232,12 +234,16 @@ class QwenTransformer(override val uid: String)
   def getGenerationConfig: GenerationConfig = $$(generationConfig)
 
   /** @group setParam */
-  def setModelIfNotSet(spark: SparkSession, onnxWrappers: DecoderWrappers): this.type = {
+  def setModelIfNotSet(
+      spark: SparkSession,
+      onnxWrappers: Option[DecoderWrappers],
+      openvinoWrapper: Option[OpenvinoWrapper]): this.type = {
     if (_model.isEmpty) {
       _model = Some(
         spark.sparkContext.broadcast(
           new Qwen(
             onnxWrappers,
+            openvinoWrapper,
             $$(merges),
             $$(vocabulary),
             generationConfig = getGenerationConfig)))
@@ -260,7 +266,8 @@ class QwenTransformer(override val uid: String)
     ignoreTokenIds -> Array(),
     batchSize -> 1,
     beamSize -> 1,
-    maxInputLength -> 4096)
+    maxInputLength -> 4096,
+    stopTokenIds -> Array())
 
   /** takes a document and annotations and produces new annotations of this annotator's annotation
     * type
@@ -294,7 +301,8 @@ class QwenTransformer(override val uid: String)
         randomSeed = this.randomSeed,
         ignoreTokenIds = $(ignoreTokenIds),
         beamSize = $(beamSize),
-        maxInputLength = $(maxInputLength))
+        maxInputLength = $(maxInputLength),
+        stopTokenIds = $(stopTokenIds))
     } else {
       Seq()
     }
@@ -309,8 +317,16 @@ class QwenTransformer(override val uid: String)
         writeOnnxModels(
           path,
           spark,
-          Seq((wrappers.decoder, "decoder_model.onnx")),
+          Seq((wrappers.get.decoder, "decoder_model.onnx")),
           QwenTransformer.suffix)
+      case Openvino.name =>
+        val wrappers = getModelIfNotSet.openvinoWrapper
+        writeOpenvinoModel(
+          path,
+          spark,
+          wrappers.get,
+          QwenTransformer.suffix,
+          QwenTransformer.openvinoFile)
     }
   }
 }
@@ -332,11 +348,12 @@ trait ReadablePretrainedQwenTransformerModel
     super.pretrained(name, lang, remoteLoc)
 }
 
-trait ReadQwenTransformerDLModel extends ReadOnnxModel {
+trait ReadQwenTransformerDLModel extends ReadOnnxModel with ReadOpenvinoModel {
   this: ParamsAndFeaturesReadable[QwenTransformer] =>
 
   override val onnxFile: String = "qwen_onnx"
   val suffix: String = "_qwen"
+  override val openvinoFile: String = "qwen_openvino"
 
   def readModel(instance: QwenTransformer, path: String, spark: SparkSession): Unit = {
     instance.getEngine match {
@@ -345,7 +362,11 @@ trait ReadQwenTransformerDLModel extends ReadOnnxModel {
           readOnnxModels(path, spark, Seq("decoder_model.onnx"), suffix)
         val onnxWrappers =
           DecoderWrappers(decoder = wrappers("decoder_model.onnx"))
-        instance.setModelIfNotSet(spark, onnxWrappers)
+        instance.setModelIfNotSet(spark, Some(onnxWrappers), None)
+      case Openvino.name =>
+        val ovWrapper =
+          readOpenvinoModel(path, spark, "_qwen_ov")
+        instance.setModelIfNotSet(spark, None, Some(ovWrapper))
       case _ =>
         throw new Exception(notSupportedEngineError)
     }
@@ -353,7 +374,10 @@ trait ReadQwenTransformerDLModel extends ReadOnnxModel {
 
   addReader(readModel)
 
-  def loadSavedModel(modelPath: String, spark: SparkSession): QwenTransformer = {
+  def loadSavedModel(
+      modelPath: String,
+      spark: SparkSession,
+      useOpenvino: Boolean = false): QwenTransformer = {
     implicit val formats: DefaultFormats.type = DefaultFormats // for json4
     val (localModelPath, detectedEngine) =
       modelSanityCheck(modelPath, isDecoder = true)
@@ -405,12 +429,18 @@ trait ReadQwenTransformerDLModel extends ReadOnnxModel {
       .setVocabulary(vocabs)
       .setMerges(bytePairs)
 
-    annotatorModel.set(annotatorModel.engine, detectedEngine)
+    val modelEngine =
+      if (useOpenvino)
+        Openvino.name
+      else
+        detectedEngine
+    annotatorModel.set(annotatorModel.engine, modelEngine)
 
     detectedEngine match {
       case ONNX.name =>
         val onnxWrapperDecoder =
           OnnxWrapper.read(
+            spark,
             localModelPath,
             zipped = false,
             useBundle = true,
@@ -419,7 +449,16 @@ trait ReadQwenTransformerDLModel extends ReadOnnxModel {
         val onnxWrappers = DecoderWrappers(onnxWrapperDecoder)
 
         annotatorModel
-          .setModelIfNotSet(spark, onnxWrappers)
+          .setModelIfNotSet(spark, Some(onnxWrappers), None)
+      case Openvino.name =>
+        val openvinoWrapper =
+          OpenvinoWrapper.read(
+            spark,
+            localModelPath,
+            zipped = false,
+            useBundle = true,
+            detectedEngine = detectedEngine)
+        annotatorModel.setModelIfNotSet(spark, None, Some(openvinoWrapper))
 
       case _ =>
         throw new Exception(notSupportedEngineError)