diff --git a/docs/en/annotator_entries/AutoGGUFEmbeddings.md b/docs/en/annotator_entries/AutoGGUFEmbeddings.md new file mode 100644 index 00000000000000..9c872393a515dc --- /dev/null +++ b/docs/en/annotator_entries/AutoGGUFEmbeddings.md @@ -0,0 +1,123 @@ +{%- capture title -%} +AutoGGUFEmbeddings +{%- endcapture -%} + +{%- capture description -%} +Annotator that uses the llama.cpp library to generate text embeddings with large language +models. + +The type of embedding pooling can be set with the `setPoolingType` method. The default is +`"MEAN"`. The available options are `"NONE"`, `"MEAN"`, `"CLS"`, and `"LAST"`. + +If the parameters are not set, the annotator will default to using the parameters provided by +the model. + +Pretrained models can be loaded with `pretrained` of the companion object: + +```scala +val autoGGUFEmbeddings = AutoGGUFEmbeddings.pretrained() + .setInputCols("document") + .setOutputCol("embeddings") +``` + +The default model is `"nomic-embed-text-v1.5.Q8_0.gguf"`, if no name is provided. + +For available pretrained models please see the [Models Hub](https://sparknlp.org/models). + +For extended examples of usage, see the +[AutoGGUFEmbeddingsTest](https://github.com/JohnSnowLabs/spark-nlp/tree/master/src/test/scala/com/johnsnowlabs/nlp/annotators/seq2seq/AutoGGUFEmbeddingsTest.scala) +and the +[example notebook](https://github.com/JohnSnowLabs/spark-nlp/tree/master/examples/python/llama.cpp/llama.cpp_in_Spark_NLP_AutoGGUFEmbeddings.ipynb). + +**Note**: To use GPU inference with this annotator, make sure to use the Spark NLP GPU package and set +the number of GPU layers with the `setNGpuLayers` method. + +When using larger models, we recommend adjusting GPU usage with `setNCtx` and `setNGpuLayers` +according to your hardware to avoid out-of-memory errors. +{%- endcapture -%} + +{%- capture input_anno -%} +DOCUMENT +{%- endcapture -%} + +{%- capture output_anno -%} +SENTENCE_EMBEDDINGS +{%- endcapture -%} + +{%- capture python_example -%} +>>> import sparknlp +>>> from sparknlp.base import * +>>> from sparknlp.annotator import * +>>> from pyspark.ml import Pipeline +>>> document = DocumentAssembler() \ +... .setInputCol("text") \ +... .setOutputCol("document") +>>> autoGGUFEmbeddings = AutoGGUFEmbeddings.pretrained() \ +... .setInputCols(["document"]) \ +... .setOutputCol("embeddings") \ +... .setBatchSize(4) \ +... .setNGpuLayers(99) \ +... 
.setPoolingType("MEAN") +>>> pipeline = Pipeline().setStages([document, autoGGUFEmbeddings]) +>>> data = spark.createDataFrame([["The moons of Jupiter are 77 in total, with 79 confirmed natural satellites and 2 man-made ones."]]).toDF("text") +>>> result = pipeline.fit(data).transform(data) +>>> result.select("completions").show() ++--------------------------------------------------------------------------------+ +| embeddings| ++--------------------------------------------------------------------------------+ +|[[-0.034486726, 0.07770534, -0.15982522, -0.017873349, 0.013914132, 0.0365736...| ++--------------------------------------------------------------------------------+ +{%- endcapture -%} + +{%- capture scala_example -%} +import com.johnsnowlabs.nlp.base._ +import com.johnsnowlabs.nlp.annotator._ +import org.apache.spark.ml.Pipeline +import spark.implicits._ + +val document = new DocumentAssembler().setInputCol("text").setOutputCol("document") + +val autoGGUFEmbeddings = AutoGGUFEmbeddings + .pretrained() + .setInputCols("document") + .setOutputCol("embeddings") + .setBatchSize(4) + .setPoolingType("MEAN") + +val pipeline = new Pipeline().setStages(Array(document, autoGGUFEmbeddings)) + +val data = Seq( + "The moons of Jupiter are 77 in total, with 79 confirmed natural satellites and 2 man-made ones.") + .toDF("text") +val result = pipeline.fit(data).transform(data) +result.select("embeddings.embeddings").show(1, truncate=80) ++--------------------------------------------------------------------------------+ +| embeddings| ++--------------------------------------------------------------------------------+ +|[[-0.034486726, 0.07770534, -0.15982522, -0.017873349, 0.013914132, 0.0365736...| ++--------------------------------------------------------------------------------+ +{%- endcapture -%} + +{%- capture api_link -%} +[AutoGGUFEmbeddings](/api/com/johnsnowlabs/nlp/embeddings/AutoGGUFEmbeddings) +{%- endcapture -%} + +{%- capture python_api_link -%} +[AutoGGUFEmbeddings](/api/python/reference/autosummary/sparknlp/annotator/embeddings/auto_gguf_embeddings/index.html) +{%- endcapture -%} + +{%- capture source_link -%} +[AutoGGUFEmbeddings](https://github.com/JohnSnowLabs/spark-nlp/tree/master/src/main/scala/com/johnsnowlabs/nlp/embeddings/AutoGGUFEmbeddings.scala) +{%- endcapture -%} + +{% include templates/anno_template.md +title=title +description=description +input_anno=input_anno +output_anno=output_anno +python_example=python_example +scala_example=scala_example +api_link=api_link +python_api_link=python_api_link +source_link=source_link +%} \ No newline at end of file diff --git a/docs/en/annotator_entries/AutoGGUF.md b/docs/en/annotator_entries/AutoGGUFModel.md similarity index 100% rename from docs/en/annotator_entries/AutoGGUF.md rename to docs/en/annotator_entries/AutoGGUFModel.md diff --git a/docs/en/annotators.md b/docs/en/annotators.md index 4526453a7ebc94..c5c21707b80f8e 100644 --- a/docs/en/annotators.md +++ b/docs/en/annotators.md @@ -45,6 +45,7 @@ There are two types of Annotators: {:.table-model-big} |Annotator|Description|Version | |---|---|---| +{% include templates/anno_table_entry.md path="" name="AutoGGUFEmbeddings" summary="Annotator that uses the llama.cpp library to generate text embeddings with large language models."%} {% include templates/anno_table_entry.md path="" name="AutoGGUFModel" summary="Annotator that uses the llama.cpp library to generate text completions with large language models."%} {% include templates/anno_table_entry.md path="" 
name="BGEEmbeddings" summary="Sentence embeddings using BGE."%} {% include templates/anno_table_entry.md path="" name="BigTextMatcher" summary="Annotator to match exact phrases (by token) provided in a file against a Document."%} diff --git a/examples/python/llama.cpp/PromptAssember_with_AutoGGUFModel.ipynb b/examples/python/llama.cpp/PromptAssember_with_AutoGGUFModel.ipynb index 9eb0f1884e8bb7..d4152e51194c25 100644 --- a/examples/python/llama.cpp/PromptAssember_with_AutoGGUFModel.ipynb +++ b/examples/python/llama.cpp/PromptAssember_with_AutoGGUFModel.ipynb @@ -251,7 +251,7 @@ "provenance": [] }, "kernelspec": { - "display_name": "Python 3", + "display_name": "sparknlp_dev", "language": "python", "name": "python3" }, @@ -264,7 +264,8 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3" + "pygments_lexer": "ipython3", + "version": "3.10.12" } }, "nbformat": 4, diff --git a/examples/python/llama.cpp/llama.cpp_in_Spark_NLP_AutoGGUFEmbeddings.ipynb b/examples/python/llama.cpp/llama.cpp_in_Spark_NLP_AutoGGUFEmbeddings.ipynb new file mode 100644 index 00000000000000..2adfdad89625ec --- /dev/null +++ b/examples/python/llama.cpp/llama.cpp_in_Spark_NLP_AutoGGUFEmbeddings.ipynb @@ -0,0 +1,429 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![JohnSnowLabs](https://sparknlp.org/assets/images/logo.png)\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/llama.cpp/llama.cpp_in_Spark_NLP_AutoGGUFEmbeddings.ipynb)\n", + "\n", + "# llama.cpp ๐Ÿฆ™ embedding models in Spark NLP ๐Ÿš€\n", + "\n", + "Let's keep in mind a few things before we start ๐Ÿ˜Š\n", + "\n", + "- Support for llama.cpp embeddings was introduced in `Spark NLP 5.5.1`, enabling quantized LLM inference on a wide range of devices. Please make sure you have upgraded to the latest Spark NLP release.\n", + "- You need to use your own `.gguf` model files, which also include the models from the [Hugging Face Models](https://huggingface.co/models?library=gguf)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Download a GGUF Model\n", + "\n", + "Lets download a GGUF model to test it out. For this, we will use [nomic-ai/nomic-embed-text-v1.5-GGUF](https://huggingface.co/nomic-ai/nomic-embed-text-v1.5-GGUF). We can download the model by selecting the Q8_0 GGUF file from the \"Files and versions\" tab.\n", + "\n", + "Once downloaded, we can directly import this model into Spark NLP!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--2024-11-02 13:42:45-- https://huggingface.co/nomic-ai/nomic-embed-text-v1.5-GGUF/resolve/main/nomic-embed-text-v1.5.Q8_0.gguf?download=true\n", + "Resolving huggingface.co (huggingface.co)... 3.160.39.87, 3.160.39.100, 3.160.39.99, ...\n", + "Connecting to huggingface.co (huggingface.co)|3.160.39.87|:443... connected.\n", + "HTTP request sent, awaiting response... 
302 Found\n", + "Location: https://cdn-lfs-us-1.hf.co/repos/19/39/19396cd98fe8b02e39b1be815db29f6b251fee34fc5d6550db0b478083fdda2f/f7af6f66802f4df86eda10fe9bbcfc75c39562bed48ef6ace719a251cf1c2fdb?response-content-disposition=attachment%3B+filename*%3DUTF-8%27%27nomic-embed-text-v1.5.Q8_0.gguf%3B+filename%3D%22nomic-embed-text-v1.5.Q8_0.gguf%22%3B&Expires=1730810566&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTczMDgxMDU2Nn19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy11cy0xLmhmLmNvL3JlcG9zLzE5LzM5LzE5Mzk2Y2Q5OGZlOGIwMmUzOWIxYmU4MTVkYjI5ZjZiMjUxZmVlMzRmYzVkNjU1MGRiMGI0NzgwODNmZGRhMmYvZjdhZjZmNjY4MDJmNGRmODZlZGExMGZlOWJiY2ZjNzVjMzk1NjJiZWQ0OGVmNmFjZTcxOWEyNTFjZjFjMmZkYj9yZXNwb25zZS1jb250ZW50LWRpc3Bvc2l0aW9uPSoifV19&Signature=R7WUe1icdziIE4kS%7EHMcsyiySLOHJpkJ9lM2nC6EtIwPj6V12RRpjpDIyyv0%7EY9h32v2HDomyyNO6-Ry5UeIm3UjyESR9In3kmTKAqZX2zzlslTQMXDQghmLpIEQGgmh5-5RHrFgYPNxKmVICyQL1Vz9IgFQRMfdug6RBTGgmmXfLgksa9IU7TdvZcqvOb68HCdmv1hEt2U5vH4A9MF81ohMBqrvTb9389jzrlP1tZtNFb5wjNdZDmr57XIsvQRZB0ZDUIsMT1nc5QehNpWpX4jMLBSnkj1-oL9XN7%7EhAXDbB1mTH9kbrD3UUNKRm4%7ER-gVhegqsfirdSFi66sP3bg__&Key-Pair-Id=K24J24Z295AEI9 [following]\n", + "--2024-11-02 13:42:46-- https://cdn-lfs-us-1.hf.co/repos/19/39/19396cd98fe8b02e39b1be815db29f6b251fee34fc5d6550db0b478083fdda2f/f7af6f66802f4df86eda10fe9bbcfc75c39562bed48ef6ace719a251cf1c2fdb?response-content-disposition=attachment%3B+filename*%3DUTF-8%27%27nomic-embed-text-v1.5.Q8_0.gguf%3B+filename%3D%22nomic-embed-text-v1.5.Q8_0.gguf%22%3B&Expires=1730810566&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTczMDgxMDU2Nn19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy11cy0xLmhmLmNvL3JlcG9zLzE5LzM5LzE5Mzk2Y2Q5OGZlOGIwMmUzOWIxYmU4MTVkYjI5ZjZiMjUxZmVlMzRmYzVkNjU1MGRiMGI0NzgwODNmZGRhMmYvZjdhZjZmNjY4MDJmNGRmODZlZGExMGZlOWJiY2ZjNzVjMzk1NjJiZWQ0OGVmNmFjZTcxOWEyNTFjZjFjMmZkYj9yZXNwb25zZS1jb250ZW50LWRpc3Bvc2l0aW9uPSoifV19&Signature=R7WUe1icdziIE4kS%7EHMcsyiySLOHJpkJ9lM2nC6EtIwPj6V12RRpjpDIyyv0%7EY9h32v2HDomyyNO6-Ry5UeIm3UjyESR9In3kmTKAqZX2zzlslTQMXDQghmLpIEQGgmh5-5RHrFgYPNxKmVICyQL1Vz9IgFQRMfdug6RBTGgmmXfLgksa9IU7TdvZcqvOb68HCdmv1hEt2U5vH4A9MF81ohMBqrvTb9389jzrlP1tZtNFb5wjNdZDmr57XIsvQRZB0ZDUIsMT1nc5QehNpWpX4jMLBSnkj1-oL9XN7%7EhAXDbB1mTH9kbrD3UUNKRm4%7ER-gVhegqsfirdSFi66sP3bg__&Key-Pair-Id=K24J24Z295AEI9\n", + "Resolving cdn-lfs-us-1.hf.co (cdn-lfs-us-1.hf.co)... 18.66.2.2, 18.66.2.116, 18.66.2.98, ...\n", + "Connecting to cdn-lfs-us-1.hf.co (cdn-lfs-us-1.hf.co)|18.66.2.2|:443... connected.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 274290560 (262M) [application/octet-stream]\n", + "Saving to: โ€˜nomic-embed-text-v1.5.Q8_0.ggufโ€™\n", + "\n", + "nomic-embed-text-v1 100%[===================>] 261.58M 23.8MB/s in 10s \n", + "\n", + "2024-11-02 13:42:56 (24.9 MB/s) - โ€˜nomic-embed-text-v1.5.Q8_0.ggufโ€™ saved [274290560/274290560]\n", + "\n" + ] + } + ], + "source": [ + "EXPORT_PATH = \"nomic-embed-text-v1.5.Q8_0.gguf\"\n", + "! wget \"https://huggingface.co/nomic-ai/nomic-embed-text-v1.5-GGUF/resolve/main/{EXPORT_PATH}?download=true\" -O {EXPORT_PATH}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Import and Save AutGGUF models in Spark NLP\n", + "\n", + "- Let's install and setup Spark NLP (if running it Google Colab)\n", + "- This part is pretty easy via our simple script" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Only execute this if you are on Google Colab\n", + "! 
wget -q http://setup.johnsnowlabs.com/colab.sh -O - | bash" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's start Spark with Spark NLP included via our simple `start()` function" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import sparknlp\n", + "\n", + "# let's start Spark with Spark NLP with GPU enabled. If you don't have GPUs available remove this parameter.\n", + "spark = sparknlp.start(gpu=True)\n", + "print(sparknlp.version())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- Let's use the `loadSavedModel` function in `AutoGGUFEmbeddings`\n", + "- Most params will be set automatically. They can also be set later at runtime after loading the model into `AutoGGUFEmbeddings`, so don't worry about setting them now.\n", + "- `loadSavedModel` accepts two params: the first is the path to the exported model, and the second is the SparkSession, i.e. the `spark` variable we previously started via `sparknlp.start()`\n", + "- The model is loaded in embedding mode by default, so it will return the embeddings in the resulting annotations.\n", + "- NOTE: `loadSavedModel` accepts local paths in addition to distributed file systems such as `HDFS`, `S3`, `DBFS`, etc. This feature was introduced in the Spark NLP 4.2.2 release. Keep in mind the best and recommended way to move/share/reuse Spark NLP models is to use `write.save` so you can use `.load()` from any file systems natively." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "jsl-llama: Extracted 'libjllama.so' to '/tmp/libjllama.so'\n" + ] + } + ], + "source": [ + "from sparknlp.annotator import *\n", + "\n", + "# All these params should be identical to the original GGUF model\n", + "autoGGUFEmbeddings = (\n", + " AutoGGUFEmbeddings.loadSavedModel(EXPORT_PATH, spark)\n", + " .setInputCols(\"document\")\n", + " .setOutputCol(\"embeddings\")\n", + " .setBatchSize(4)\n", + " .setNGpuLayers(99)\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- Let's save it on disk so it is easier to move around and can also be used later via the `.load` function" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "24/11/02 13:48:29 WARN TaskSetManager: Stage 0 contains a task of very large size (1073 KiB). The maximum recommended task size is 1000 KiB.\n" + ] + } + ], + "source": [ + "autoGGUFEmbeddings.write().overwrite().save(f\"nomic-embed-text-v1.5.Q8_0.gguf_spark_nlp\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's clean up stuff we don't need anymore" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!rm -rf {EXPORT_PATH}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Awesome 😎!\n", + "\n", + "This is your GGUF model, loaded and saved by Spark NLP 🚀" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "total 267872\n", + "drwxr-xr-x 2 root root 4096 Nov 2 13:48 metadata\n", + "-rwxrwxr-x 1 root root 274290560 Nov 2 13:48 nomic-embed-text-v1.5.Q8_0.gguf\n" + ] + } + ], + "source": [ + "! 
ls -l nomic-embed-text-v1.5.Q8_0.gguf_spark_nlp/" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now let's see how we can use it on other machines, clusters, or any place you wish to use your new and shiny GGUF model ๐Ÿ˜Š" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "24/11/02 13:48:57 WARN SparkContext: The path /home/root/Workspace/scala/spark-nlp/examples/python/llama.cpp/nomic-embed-text-v1.5.Q8_0.gguf_spark_nlp/nomic-embed-text-v1.5.Q8_0.gguf has been added already. Overwriting of added paths is not supported in the current version.\n", + "24/11/02 13:48:57 WARN DAGScheduler: Broadcasting large task binary with size 1028.0 KiB\n", + "24/11/02 13:48:57 WARN DAGScheduler: Broadcasting large task binary with size 1028.0 KiB\n", + "24/11/02 13:48:57 WARN DAGScheduler: Broadcasting large task binary with size 1028.0 KiB\n", + "llama_model_loader: loaded meta data with 22 key-value pairs and 112 tensors from /tmp/spark-6de50aee-1059-4698-98e2-db9d68663467/userFiles-932de0e7-9a8f-41f5-9aaf-94bb7406df74/nomic-embed-text-v1.5.Q8_0.gguf (version GGUF V3 (latest))\n", + "llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", + "llama_model_loader: - kv 0: general.architecture str = nomic-bert\n", + "llama_model_loader: - kv 1: general.name str = nomic-embed-text-v1.5\n", + "llama_model_loader: - kv 2: nomic-bert.block_count u32 = 12\n", + "llama_model_loader: - kv 3: nomic-bert.context_length u32 = 2048\n", + "llama_model_loader: - kv 4: nomic-bert.embedding_length u32 = 768\n", + "llama_model_loader: - kv 5: nomic-bert.feed_forward_length u32 = 3072\n", + "llama_model_loader: - kv 6: nomic-bert.attention.head_count u32 = 12\n", + "llama_model_loader: - kv 7: nomic-bert.attention.layer_norm_epsilon f32 = 0.000000\n", + "llama_model_loader: - kv 8: general.file_type u32 = 1\n", + "llama_model_loader: - kv 9: nomic-bert.attention.causal bool = false\n", + "llama_model_loader: - kv 10: nomic-bert.pooling_type u32 = 1\n", + "llama_model_loader: - kv 11: nomic-bert.rope.freq_base f32 = 1000.000000\n", + "llama_model_loader: - kv 12: tokenizer.ggml.token_type_count u32 = 2\n", + "llama_model_loader: - kv 13: tokenizer.ggml.bos_token_id u32 = 101\n", + "llama_model_loader: - kv 14: tokenizer.ggml.eos_token_id u32 = 102\n", + "llama_model_loader: - kv 15: tokenizer.ggml.model str = bert\n", + "llama_model_loader: - kv 16: tokenizer.ggml.tokens arr[str,30522] = [\"[PAD]\", \"[unused0]\", \"[unused1]\", \"...\n", + "llama_model_loader: - kv 17: tokenizer.ggml.scores arr[f32,30522] = [-1000.000000, -1000.000000, -1000.00...\n", + "llama_model_loader: - kv 18: tokenizer.ggml.token_type arr[i32,30522] = [3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...\n", + "llama_model_loader: - kv 19: tokenizer.ggml.unknown_token_id u32 = 100\n", + "llama_model_loader: - kv 20: tokenizer.ggml.seperator_token_id u32 = 102\n", + "llama_model_loader: - kv 21: tokenizer.ggml.padding_token_id u32 = 0\n", + "llama_model_loader: - type f32: 51 tensors\n", + "llama_model_loader: - type f16: 61 tensors\n", + "llm_load_vocab: special tokens cache size = 5\n", + "llm_load_vocab: token to piece cache size = 0.2032 MB\n", + "llm_load_print_meta: format = GGUF V3 (latest)\n", + "llm_load_print_meta: arch = nomic-bert\n", + "llm_load_print_meta: vocab type = WPM\n", + "llm_load_print_meta: n_vocab = 30522\n", + "llm_load_print_meta: n_merges = 0\n", + 
"llm_load_print_meta: vocab_only = 0\n", + "llm_load_print_meta: n_ctx_train = 2048\n", + "llm_load_print_meta: n_embd = 768\n", + "llm_load_print_meta: n_layer = 12\n", + "llm_load_print_meta: n_head = 12\n", + "llm_load_print_meta: n_head_kv = 12\n", + "llm_load_print_meta: n_rot = 64\n", + "llm_load_print_meta: n_swa = 0\n", + "llm_load_print_meta: n_embd_head_k = 64\n", + "llm_load_print_meta: n_embd_head_v = 64\n", + "llm_load_print_meta: n_gqa = 1\n", + "llm_load_print_meta: n_embd_k_gqa = 768\n", + "llm_load_print_meta: n_embd_v_gqa = 768\n", + "llm_load_print_meta: f_norm_eps = 1.0e-12\n", + "llm_load_print_meta: f_norm_rms_eps = 0.0e+00\n", + "llm_load_print_meta: f_clamp_kqv = 0.0e+00\n", + "llm_load_print_meta: f_max_alibi_bias = 0.0e+00\n", + "llm_load_print_meta: f_logit_scale = 0.0e+00\n", + "llm_load_print_meta: n_ff = 3072\n", + "llm_load_print_meta: n_expert = 0\n", + "llm_load_print_meta: n_expert_used = 0\n", + "llm_load_print_meta: causal attn = 0\n", + "llm_load_print_meta: pooling type = 1\n", + "llm_load_print_meta: rope type = 2\n", + "llm_load_print_meta: rope scaling = linear\n", + "llm_load_print_meta: freq_base_train = 1000.0\n", + "llm_load_print_meta: freq_scale_train = 1\n", + "llm_load_print_meta: n_ctx_orig_yarn = 2048\n", + "llm_load_print_meta: rope_finetuned = unknown\n", + "llm_load_print_meta: ssm_d_conv = 0\n", + "llm_load_print_meta: ssm_d_inner = 0\n", + "llm_load_print_meta: ssm_d_state = 0\n", + "llm_load_print_meta: ssm_dt_rank = 0\n", + "llm_load_print_meta: model type = 137M\n", + "llm_load_print_meta: model ftype = F16\n", + "llm_load_print_meta: model params = 136.73 M\n", + "llm_load_print_meta: model size = 260.86 MiB (16.00 BPW) \n", + "llm_load_print_meta: general.name = nomic-embed-text-v1.5\n", + "llm_load_print_meta: BOS token = 101 '[CLS]'\n", + "llm_load_print_meta: EOS token = 102 '[SEP]'\n", + "llm_load_print_meta: UNK token = 100 '[UNK]'\n", + "llm_load_print_meta: SEP token = 102 '[SEP]'\n", + "llm_load_print_meta: PAD token = 0 '[PAD]'\n", + "llm_load_print_meta: CLS token = 101 '[CLS]'\n", + "llm_load_print_meta: MASK token = 103 '[MASK]'\n", + "llm_load_print_meta: LF token = 0 '[PAD]'\n", + "llm_load_print_meta: max token length = 21\n", + "llm_load_tensors: ggml ctx size = 0.05 MiB\n", + "llm_load_tensors: CPU buffer size = 260.86 MiB\n", + ".......................................................\n", + "llama_new_context_with_model: n_ctx = 4096\n", + "llama_new_context_with_model: n_batch = 512\n", + "llama_new_context_with_model: n_ubatch = 512\n", + "llama_new_context_with_model: flash_attn = 0\n", + "llama_new_context_with_model: freq_base = 1000.0\n", + "llama_new_context_with_model: freq_scale = 1\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[WARN] Not compiled with GPU offload support, --n-gpu-layers option will be ignored. 
See main README.md for information on enabling GPU BLAS support n_gpu_layers=-1\n", + "[INFO] build info build=3534 commit=\"641f5dd2\"\n", + "[INFO] system info n_threads=6 n_threads_batch=-1 total_threads=6 system_info=\"AVX = 1 | AVX_VNNI = 0 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 1 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | \"\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama_kv_cache_init: CPU KV buffer size = 144.00 MiB\n", + "llama_new_context_with_model: KV self size = 144.00 MiB, K (f16): 72.00 MiB, V (f16): 72.00 MiB\n", + "llama_new_context_with_model: CPU output buffer size = 0.00 MiB\n", + "ggml_gallocr_reserve_n: reallocating CPU buffer from size 0.00 MiB to 23.00 MiB\n", + "llama_new_context_with_model: CPU compute buffer size = 23.00 MiB\n", + "llama_new_context_with_model: graph nodes = 453\n", + "llama_new_context_with_model: graph splits = 1\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[INFO] initializing slots n_slots=4\n", + "[INFO] new slot id_slot=0 n_ctx_slot=1024\n", + "[INFO] new slot id_slot=1 n_ctx_slot=1024\n", + "[INFO] new slot id_slot=2 n_ctx_slot=1024\n", + "[INFO] new slot id_slot=3 n_ctx_slot=1024\n", + "[INFO] model loaded\n", + "[INFO] chat template chat_example=\"<|im_start|>system\\nYou are a helpful assistant<|im_end|>\\n<|im_start|>user\\nHello<|im_end|>\\n<|im_start|>assistant\\nHi there<|im_end|>\\n<|im_start|>user\\nHow are you?<|im_end|>\\n<|im_start|>assistant\\n\" built_in=true\n", + "[INFO] slot is processing task id_slot=0 id_task=0\n", + "[INFO] kv cache rm [p0, end) id_slot=0 id_task=0 p0=0\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[Stage 12:> (0 + 1) / 1]\r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[INFO] slot released id_slot=0 id_task=0 n_ctx=4096 n_past=7 n_system_tokens=0 n_cache_tokens=0 truncated=false\n", + "[INFO] all slots are idle\n", + "+--------------------------------------------------------------------------------+\n", + "| embeddings|\n", + "+--------------------------------------------------------------------------------+\n", + "|[[0.046383496, 0.02353651, -0.12484242, -0.009759982, 0.05522549, -0.01701891...|\n", + "+--------------------------------------------------------------------------------+\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + } + ], + "source": [ + "import sparknlp\n", + "from sparknlp.base import *\n", + "from sparknlp.annotator import *\n", + "from pyspark.ml import Pipeline\n", + "\n", + "document_assembler = DocumentAssembler().setInputCol(\"text\").setOutputCol(\"document\")\n", + "\n", + "autoGGUFEmbeddings = AutoGGUFEmbeddings.load(\"nomic-embed-text-v1.5.Q8_0.gguf_spark_nlp\")\n", + "\n", + "pipeline = Pipeline().setStages([document_assembler, autoGGUFEmbeddings])\n", + "\n", + "data = spark.createDataFrame([[\"This is a sentence.\"]]).toDF(\"text\")\n", + "\n", + "result = pipeline.fit(data).transform(data)\n", + "result.select(\"embeddings.embeddings\").show(1, 80)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "That's it! 
You can now go wild and use hundreds of GGUF models from HuggingFace ๐Ÿค— in Spark NLP ๐Ÿš€\n" + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/python/sparknlp/annotator/embeddings/__init__.py b/python/sparknlp/annotator/embeddings/__init__.py index be622971684f8b..da453d2c555037 100644 --- a/python/sparknlp/annotator/embeddings/__init__.py +++ b/python/sparknlp/annotator/embeddings/__init__.py @@ -40,3 +40,4 @@ from sparknlp.annotator.embeddings.mxbai_embeddings import * from sparknlp.annotator.embeddings.snowflake_embeddings import * from sparknlp.annotator.embeddings.nomic_embeddings import * +from sparknlp.annotator.embeddings.auto_gguf_embeddings import * diff --git a/python/sparknlp/annotator/embeddings/auto_gguf_embeddings.py b/python/sparknlp/annotator/embeddings/auto_gguf_embeddings.py new file mode 100755 index 00000000000000..30cee663c16129 --- /dev/null +++ b/python/sparknlp/annotator/embeddings/auto_gguf_embeddings.py @@ -0,0 +1,538 @@ +# Copyright 2017-2023 John Snow Labs +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Contains classes for the AutoGGUFEmbeddings.""" +from typing import List + +from sparknlp.common import * + + +class AutoGGUFEmbeddings(AnnotatorModel, HasBatchedAnnotate): + """ + Annotator that uses the llama.cpp library to generate text embeddings with large language + models + + The type of embedding pooling can be set with the `setPoolingType` method. The default is + `"MEAN"`. The available options are `"NONE"`, `"MEAN"`, `"CLS"`, and `"LAST"`. + + Pretrained models can be loaded with :meth:`.pretrained` of the companion + object: + + >>> auto_gguf_model = AutoGGUFEmbeddings.pretrained() \\ + ... .setInputCols(["document"]) \\ + ... .setOutputCol("embeddings") + + The default model is ``"nomic-embed-text-v1.5.Q8_0.gguf"``, if no name is provided. + + For extended examples of usage, see the + `AutoGGUFEmbeddingsTest `__ + and the + `example notebook `__. + + For available pretrained models please see the `Models Hub `__. 
+ + ====================== ====================== + Input Annotation types Output Annotation type + ====================== ====================== + ``DOCUMENT`` ``SENTENCE_EMBEDDINGS`` + ====================== ====================== + + Parameters + ---------- + nThreads + Set the number of threads to use during generation + nThreadsBatch + Set the number of threads to use during batch and prompt processing + nCtx + Set the size of the prompt context + nBatch + Set the logical batch size for prompt processing (must be >=32 to use BLAS) + nUbatch + Set the physical batch size for prompt processing (must be >=32 to use BLAS) + nChunks + Set the maximal number of chunks to process + nSequences + Set the number of sequences to decode + nGpuLayers + Set the number of layers to store in VRAM (-1 - use default) + gpuSplitMode + Set how to split the model across GPUs + mainGpu + Set the main GPU that is used for scratch and small tensors. + tensorSplit + Set how split tensors should be distributed across GPUs + grpAttnN + Set the group-attention factor + grpAttnW + Set the group-attention width + ropeFreqBase + Set the RoPE base frequency, used by NTK-aware scaling + ropeFreqScale + Set the RoPE frequency scaling factor, expands context by a factor of 1/N + yarnExtFactor + Set the YaRN extrapolation mix factor + yarnAttnFactor + Set the YaRN scale sqrt(t) or attention magnitude + yarnBetaFast + Set the YaRN low correction dim or beta + yarnBetaSlow + Set the YaRN high correction dim or alpha + yarnOrigCtx + Set the YaRN original context size of model + defragmentationThreshold + Set the KV cache defragmentation threshold + numaStrategy + Set optimization strategies that help on some NUMA systems (if available) + ropeScalingType + Set the RoPE frequency scaling method, defaults to linear unless specified by the model + poolingType + Set the pooling type for embeddings, use model default if unspecified + flashAttention + Whether to enable Flash Attention + useMmap + Whether to use memory-map model (faster load but may increase pageouts if not using mlock) + useMlock + Whether to force the system to keep model in RAM rather than swapping or compressing + noKvOffload + Whether to disable KV offload + + Notes + ----- + To use GPU inference with this annotator, make sure to use the Spark NLP GPU package and set + the number of GPU layers with the `setNGpuLayers` method. + + When using larger models, we recommend adjusting GPU usage with `setNCtx` and `setNGpuLayers` + according to your hardware to avoid out-of-memory errors. + + Examples + -------- + >>> import sparknlp + >>> from sparknlp.base import * + >>> from sparknlp.annotator import * + >>> from pyspark.ml import Pipeline + >>> document = DocumentAssembler() \\ + ... .setInputCol("text") \\ + ... .setOutputCol("document") + >>> autoGGUFEmbeddings = AutoGGUFEmbeddings.pretrained() \\ + ... .setInputCols(["document"]) \\ + ... .setOutputCol("embeddings") \\ + ... .setBatchSize(4) \\ + ... .setNGpuLayers(99) \\ + ... 
.setPoolingType("MEAN") + >>> pipeline = Pipeline().setStages([document, autoGGUFEmbeddings]) + >>> data = spark.createDataFrame([["The moons of Jupiter are 77 in total, with 79 confirmed natural satellites and 2 man-made ones."]]).toDF("text") + >>> result = pipeline.fit(data).transform(data) + >>> result.select("embeddings.embeddings").show(truncate = False) + +--------------------------------------------------------------------------------+ + | embeddings| + +--------------------------------------------------------------------------------+ + |[[-0.034486726, 0.07770534, -0.15982522, -0.017873349, 0.013914132, 0.0365736...| + +--------------------------------------------------------------------------------+ + """ + + name = "AutoGGUFEmbeddings" + inputAnnotatorTypes = [AnnotatorType.DOCUMENT] + outputAnnotatorType = AnnotatorType.DOCUMENT + + # -------- MODEl PARAMETERS -------- + nThreads = Param( + Params._dummy(), + "nThreads", + "Set the number of threads to use during generation", + typeConverter=TypeConverters.toInt, + ) + nThreadsBatch = Param( + Params._dummy(), + "nThreadsBatch", + "Set the number of threads to use during batch and prompt processing", + typeConverter=TypeConverters.toInt, + ) + nCtx = Param( + Params._dummy(), + "nCtx", + "Set the size of the prompt context", + typeConverter=TypeConverters.toInt, + ) + nBatch = Param( + Params._dummy(), + "nBatch", + "Set the logical batch size for prompt processing (must be >=32 to use BLAS)", + typeConverter=TypeConverters.toInt, + ) + nUbatch = Param( + Params._dummy(), + "nUbatch", + "Set the physical batch size for prompt processing (must be >=32 to use BLAS)", + typeConverter=TypeConverters.toInt, + ) + nChunks = Param( + Params._dummy(), + "nChunks", + "Set the maximal number of chunks to process", + typeConverter=TypeConverters.toInt, + ) + nSequences = Param( + Params._dummy(), + "nSequences", + "Set the number of sequences to decode", + typeConverter=TypeConverters.toInt, + ) + nGpuLayers = Param( + Params._dummy(), + "nGpuLayers", + "Set the number of layers to store in VRAM (-1 - use default)", + typeConverter=TypeConverters.toInt, + ) + # Set how to split the model across GPUs + # + # - NONE: No GPU split + # - LAYER: Split the model across GPUs by layer + # - ROW: Split the model across GPUs by rows + gpuSplitMode = Param( + Params._dummy(), + "gpuSplitMode", + "Set how to split the model across GPUs", + typeConverter=TypeConverters.toString, + ) + mainGpu = Param( + Params._dummy(), + "mainGpu", + "Set the main GPU that is used for scratch and small tensors.", + typeConverter=TypeConverters.toInt, + ) + tensorSplit = Param( + Params._dummy(), + "tensorSplit", + "Set how split tensors should be distributed across GPUs", + typeConverter=TypeConverters.toListFloat, + ) + grpAttnN = Param( + Params._dummy(), + "grpAttnN", + "Set the group-attention factor", + typeConverter=TypeConverters.toInt, + ) + grpAttnW = Param( + Params._dummy(), + "grpAttnW", + "Set the group-attention width", + typeConverter=TypeConverters.toInt, + ) + ropeFreqBase = Param( + Params._dummy(), + "ropeFreqBase", + "Set the RoPE base frequency, used by NTK-aware scaling", + typeConverter=TypeConverters.toFloat, + ) + ropeFreqScale = Param( + Params._dummy(), + "ropeFreqScale", + "Set the RoPE frequency scaling factor, expands context by a factor of 1/N", + typeConverter=TypeConverters.toFloat, + ) + yarnExtFactor = Param( + Params._dummy(), + "yarnExtFactor", + "Set the YaRN extrapolation mix factor", + typeConverter=TypeConverters.toFloat, + ) 
+ yarnAttnFactor = Param( + Params._dummy(), + "yarnAttnFactor", + "Set the YaRN scale sqrt(t) or attention magnitude", + typeConverter=TypeConverters.toFloat, + ) + yarnBetaFast = Param( + Params._dummy(), + "yarnBetaFast", + "Set the YaRN low correction dim or beta", + typeConverter=TypeConverters.toFloat, + ) + yarnBetaSlow = Param( + Params._dummy(), + "yarnBetaSlow", + "Set the YaRN high correction dim or alpha", + typeConverter=TypeConverters.toFloat, + ) + yarnOrigCtx = Param( + Params._dummy(), + "yarnOrigCtx", + "Set the YaRN original context size of model", + typeConverter=TypeConverters.toInt, + ) + defragmentationThreshold = Param( + Params._dummy(), + "defragmentationThreshold", + "Set the KV cache defragmentation threshold", + typeConverter=TypeConverters.toFloat, + ) + # Set optimization strategies that help on some NUMA systems (if available) + # + # Available Strategies: + # + # - DISABLED: No NUMA optimizations + # - DISTRIBUTE: Spread execution evenly over all + # - ISOLATE: Only spawn threads on CPUs on the node that execution started on + # - NUMA_CTL: Use the CPU map provided by numactl + # - MIRROR: Mirrors the model across NUMA nodes + numaStrategy = Param( + Params._dummy(), + "numaStrategy", + "Set optimization strategies that help on some NUMA systems (if available)", + typeConverter=TypeConverters.toString, + ) + # Set the RoPE frequency scaling method, defaults to linear unless specified by the model. + # + # - UNSPECIFIED: Don't use any scaling + # - LINEAR: Linear scaling + # - YARN: YaRN RoPE scaling + ropeScalingType = Param( + Params._dummy(), + "ropeScalingType", + "Set the RoPE frequency scaling method, defaults to linear unless specified by the model", + typeConverter=TypeConverters.toString, + ) + # Set the pooling type for embeddings, use model default if unspecified + # + # - 0 UNSPECIFIED: Don't use any pooling + # - 1 MEAN: Mean Pooling + # - 2 CLS: CLS Pooling + poolingType = Param( + Params._dummy(), + "poolingType", + "Set the pooling type for embeddings, use model default if unspecified", + typeConverter=TypeConverters.toString, + ) + embedding = Param( + Params._dummy(), + "embedding", + "Whether to load model with embedding support", + typeConverter=TypeConverters.toBoolean, + ) + flashAttention = Param( + Params._dummy(), + "flashAttention", + "Whether to enable Flash Attention", + typeConverter=TypeConverters.toBoolean, + ) + useMmap = Param( + Params._dummy(), + "useMmap", + "Whether to use memory-map model (faster load but may increase pageouts if not using mlock)", + typeConverter=TypeConverters.toBoolean, + ) + useMlock = Param( + Params._dummy(), + "useMlock", + "Whether to force the system to keep model in RAM rather than swapping or compressing", + typeConverter=TypeConverters.toBoolean, + ) + noKvOffload = Param( + Params._dummy(), + "noKvOffload", + "Whether to disable KV offload", + typeConverter=TypeConverters.toBoolean, + ) + + # -------- MODEL SETTERS -------- + def setNThreads(self, nThreads: int): + """Set the number of threads to use during generation""" + return self._set(nThreads=nThreads) + + def setNThreadsBatch(self, nThreadsBatch: int): + """Set the number of threads to use during batch and prompt processing""" + return self._set(nThreadsBatch=nThreadsBatch) + + def setNCtx(self, nCtx: int): + """Set the size of the prompt context""" + return self._set(nCtx=nCtx) + + def setNBatch(self, nBatch: int): + """Set the logical batch size for prompt processing (must be >=32 to use BLAS)""" + return self._set(nBatch=nBatch) + 
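(Editor's note: the context and batching setters above map directly onto llama.cpp loader options. Below is a minimal usage sketch, not part of this file, that combines a few of them on the pretrained embeddings annotator; the values simply echo the defaults and examples documented in this class, and an active Spark NLP session is assumed.)

```python
from sparknlp.annotator import AutoGGUFEmbeddings

# Minimal sketch: tuning context size and batching on the default pretrained model.
embeddings = (
    AutoGGUFEmbeddings.pretrained()   # defaults to nomic-embed-text-v1.5.Q8_0.gguf
    .setInputCols("document")
    .setOutputCol("embeddings")
    .setBatchSize(4)                  # rows per annotation batch (HasBatchedAnnotate)
    .setNCtx(4096)                    # size of the prompt context
    .setNBatch(512)                   # logical batch size (must be >= 32 to use BLAS)
    .setNUbatch(512)                  # physical batch size (must be >= 32 to use BLAS)
    .setNGpuLayers(99)                # layers kept in VRAM; needs the Spark NLP GPU package
)
```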
+ def setNUbatch(self, nUbatch: int): + """Set the physical batch size for prompt processing (must be >=32 to use BLAS)""" + return self._set(nUbatch=nUbatch) + + def setNChunks(self, nChunks: int): + """Set the maximal number of chunks to process""" + return self._set(nChunks=nChunks) + + def setNSequences(self, nSequences: int): + """Set the number of sequences to decode""" + return self._set(nSequences=nSequences) + + def setNGpuLayers(self, nGpuLayers: int): + """Set the number of layers to store in VRAM (-1 - use default)""" + return self._set(nGpuLayers=nGpuLayers) + + def setGpuSplitMode(self, gpuSplitMode: str): + """Set how to split the model across GPUs""" + return self._set(gpuSplitMode=gpuSplitMode) + + def setMainGpu(self, mainGpu: int): + """Set the main GPU that is used for scratch and small tensors.""" + return self._set(mainGpu=mainGpu) + + def setTensorSplit(self, tensorSplit: List[float]): + """Set how split tensors should be distributed across GPUs""" + return self._set(tensorSplit=tensorSplit) + + def setGrpAttnN(self, grpAttnN: int): + """Set the group-attention factor""" + return self._set(grpAttnN=grpAttnN) + + def setGrpAttnW(self, grpAttnW: int): + """Set the group-attention width""" + return self._set(grpAttnW=grpAttnW) + + def setRopeFreqBase(self, ropeFreqBase: float): + """Set the RoPE base frequency, used by NTK-aware scaling""" + return self._set(ropeFreqBase=ropeFreqBase) + + def setRopeFreqScale(self, ropeFreqScale: float): + """Set the RoPE frequency scaling factor, expands context by a factor of 1/N""" + return self._set(ropeFreqScale=ropeFreqScale) + + def setYarnExtFactor(self, yarnExtFactor: float): + """Set the YaRN extrapolation mix factor""" + return self._set(yarnExtFactor=yarnExtFactor) + + def setYarnAttnFactor(self, yarnAttnFactor: float): + """Set the YaRN scale sqrt(t) or attention magnitude""" + return self._set(yarnAttnFactor=yarnAttnFactor) + + def setYarnBetaFast(self, yarnBetaFast: float): + """Set the YaRN low correction dim or beta""" + return self._set(yarnBetaFast=yarnBetaFast) + + def setYarnBetaSlow(self, yarnBetaSlow: float): + """Set the YaRN high correction dim or alpha""" + return self._set(yarnBetaSlow=yarnBetaSlow) + + def setYarnOrigCtx(self, yarnOrigCtx: int): + """Set the YaRN original context size of model""" + return self._set(yarnOrigCtx=yarnOrigCtx) + + def setDefragmentationThreshold(self, defragmentationThreshold: float): + """Set the KV cache defragmentation threshold""" + return self._set(defragmentationThreshold=defragmentationThreshold) + + def setNumaStrategy(self, numaStrategy: str): + """Set optimization strategies that help on some NUMA systems (if available)""" + numaUpper = numaStrategy.upper() + numaStrategies = ["DISABLED", "DISTRIBUTE", "ISOLATE", "NUMA_CTL", "MIRROR"] + if numaUpper not in numaStrategies: + raise ValueError( + f"Invalid NUMA strategy: {numaUpper}. " + + f"Valid values are: {numaStrategies}" + ) + return self._set(numaStrategy=numaStrategy) + + def setRopeScalingType(self, ropeScalingType: str): + """Set the RoPE frequency scaling method, defaults to linear unless specified by the model""" + return self._set(ropeScalingType=ropeScalingType) + + def setPoolingType(self, poolingType: str): + """Set the pooling type for embeddings, use model default if unspecified""" + poolingTypeUpper = poolingType.upper() + poolingTypes = ["NONE", "MEAN", "CLS", "LAST"] + if poolingTypeUpper not in poolingTypes: + raise ValueError( + f"Invalid pooling type: {poolingType}. 
" + + f"Valid values are: {poolingTypes}" + ) + return self._set(poolingType=poolingType) + + def setFlashAttention(self, flashAttention: bool): + """Whether to enable Flash Attention""" + return self._set(flashAttention=flashAttention) + + def setUseMmap(self, useMmap: bool): + """Whether to use memory-map model (faster load but may increase pageouts if not using mlock)""" + return self._set(useMmap=useMmap) + + def setUseMlock(self, useMlock: bool): + """Whether to force the system to keep model in RAM rather than swapping or compressing""" + return self._set(useMlock=useMlock) + + def setNoKvOffload(self, noKvOffload: bool): + """Whether to disable KV offload""" + return self._set(noKvOffload=noKvOffload) + + def getMetadata(self): + """Gets the metadata of the model""" + return self._call_java("getMetadata") + + @keyword_only + def __init__( + self, + classname="com.johnsnowlabs.nlp.embeddings.AutoGGUFEmbeddings", + java_model=None, + ): + super(AutoGGUFEmbeddings, self).__init__( + classname=classname, java_model=java_model + ) + self._setDefault( + embedding=True, + nCtx=4096, + nBatch=512, + poolingType="MEAN", + ) + + @staticmethod + def loadSavedModel(folder, spark_session): + """Loads a locally saved model. + + Parameters + ---------- + folder : str + Folder of the saved model + spark_session : pyspark.sql.SparkSession + The current SparkSession + + Returns + ------- + AutoGGUFEmbeddings + The restored model + """ + from sparknlp.internal import _AutoGGUFEmbeddingsLoader + + jModel = _AutoGGUFEmbeddingsLoader(folder, spark_session._jsparkSession)._java_obj + return AutoGGUFEmbeddings(java_model=jModel) + + @staticmethod + def pretrained(name="nomic-embed-text-v1.5.Q8_0.gguf", lang="en", remote_loc=None): + """Downloads and loads a pretrained model. + + Parameters + ---------- + name : str, optional + Name of the pretrained model, by default "nomic-embed-text-v1.5.Q8_0.gguf" + lang : str, optional + Language of the pretrained model, by default "en" + remote_loc : str, optional + Optional remote address of the resource, by default None. Will use + Spark NLPs repositories otherwise. + + Returns + ------- + AutoGGUFEmbeddings + The restored model + """ + from sparknlp.pretrained import ResourceDownloader + + return ResourceDownloader.downloadModel( + AutoGGUFEmbeddings, name, lang, remote_loc + ) diff --git a/python/sparknlp/annotator/seq2seq/auto_gguf_model.py b/python/sparknlp/annotator/seq2seq/auto_gguf_model.py index 37af88d7dbbe15..d28ac006c9da22 100755 --- a/python/sparknlp/annotator/seq2seq/auto_gguf_model.py +++ b/python/sparknlp/annotator/seq2seq/auto_gguf_model.py @@ -199,7 +199,6 @@ class AutoGGUFModel(AnnotatorModel, HasBatchedAnnotate): useChatTemplate Set whether or not generate should apply a chat template - Notes ----- To use GPU inference with this annotator, make sure to use the Spark NLP GPU package and set @@ -208,29 +207,6 @@ class AutoGGUFModel(AnnotatorModel, HasBatchedAnnotate): When using larger models, we recommend adjusting GPU usage with `setNCtx` and `setNGpuLayers` according to your hardware to avoid out-of-memory errors. - References - ---------- - - `Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension - `__ - - https://github.com/pytorch/fairseq - - **Paper Abstract:** - *We present BART, a denoising autoencoder for pretraining sequence-to-sequence models. - BART is trained by (1) corrupting text with an arbitrary noising function, and (2) - learning a model to reconstruct the original text. 
It uses a standard Tranformer-based - neural machine translation architecture which, despite its simplicity, can be seen as - generalizing BERT (due to the bidirectional encoder), GPT (with the left-to-right decoder), - and many other more recent pretraining schemes. We evaluate a number of noising approaches, - finding the best performance by both randomly shuffling the order of the original sentences - and using a novel in-filling scheme, where spans of text are replaced with a single mask token. - BART is particularly effective when fine tuned for text generation but also works well for - comprehension tasks. It matches the performance of RoBERTa with comparable training resources - on GLUE and SQuAD, achieves new state-of-the-art results on a range of abstractive dialogue, - question answering, and summarization tasks, with gains of up to 6 ROUGE. BART also provides - a 1.1 BLEU increase over a back-translation system for machine translation, with only target - language pretraining. We also report ablation experiments that replicate other pretraining - schemes within the BART framework, to better measure which factors most influence end-task performance.* - Examples -------- >>> import sparknlp @@ -553,6 +529,13 @@ def setDefragmentationThreshold(self, defragmentationThreshold: float): def setNumaStrategy(self, numaStrategy: str): """Set optimization strategies that help on some NUMA systems (if available)""" + numaUpper = numaStrategy.upper() + numaStrategies = ["DISABLED", "DISTRIBUTE", "ISOLATE", "NUMA_CTL", "MIRROR"] + if numaUpper not in numaStrategies: + raise ValueError( + f"Invalid NUMA strategy: {numaUpper}. " + + f"Valid values are: {numaStrategies}" + ) return self._set(numaStrategy=numaStrategy) def setRopeScalingType(self, ropeScalingType: str): @@ -561,6 +544,13 @@ def setRopeScalingType(self, ropeScalingType: str): def setPoolingType(self, poolingType: bool): """Set the pooling type for embeddings, use model default if unspecified""" + poolingTypeUpper = poolingType.upper() + poolingTypes = ["NONE", "MEAN", "CLS", "LAST"] + if poolingTypeUpper not in poolingTypes: + raise ValueError( + f"Invalid pooling type: {poolingType}. 
" + + f"Valid values are: {poolingTypes}" + ) return self._set(poolingType=poolingType) def setModelDraft(self, modelDraft: str): diff --git a/python/sparknlp/internal/__init__.py b/python/sparknlp/internal/__init__.py index eec3544dc41c6f..4cb5321e8a8691 100644 --- a/python/sparknlp/internal/__init__.py +++ b/python/sparknlp/internal/__init__.py @@ -1007,10 +1007,17 @@ def __init__(self, path, jspark): "com.johnsnowlabs.nlp.embeddings.SnowFlakeEmbeddings.loadSavedModel", path, jspark ) + +class _AutoGGUFEmbeddingsLoader(ExtendedJavaWrapper): + def __init__(self, path, jspark): + super(_AutoGGUFEmbeddingsLoader, self).__init__( + "com.johnsnowlabs.nlp.embeddings.AutoGGUFEmbeddings.loadSavedModel", path, jspark) + + class _BLIPForQuestionAnswering(ExtendedJavaWrapper): def __init__(self, path, jspark): super(_BLIPForQuestionAnswering, self).__init__( "com.johnsnowlabs.nlp.annotators.cv.BLIPForQuestionAnswering.loadSavedModel", path, jspark, - ) \ No newline at end of file + ) diff --git a/python/test/annotator/embeddings/auto_gguf_embeddings_test.py b/python/test/annotator/embeddings/auto_gguf_embeddings_test.py new file mode 100644 index 00000000000000..72b82c19b6e830 --- /dev/null +++ b/python/test/annotator/embeddings/auto_gguf_embeddings_test.py @@ -0,0 +1,106 @@ +# Copyright 2017-2023 John Snow Labs +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import unittest + +import pytest + +from sparknlp.annotator import * +from sparknlp.base import * +from test.util import SparkContextForTest + + +@pytest.mark.slow +class AutoGGUFModelTestSpec(unittest.TestCase): + def setUp(self): + self.spark = SparkContextForTest.spark + self.data = ( + self.spark.createDataFrame( + [ + ["The moons of Jupiter are "], + ["Earth is "], + ["The moon is "], + ["The sun is "], + ] + ) + .toDF("text") + .repartition(1) + ) + self.document_assembler = ( + DocumentAssembler().setInputCol("text").setOutputCol("document") + ) + + def runTest(self): + model = ( + AutoGGUFEmbeddings.pretrained() + .setInputCols("document") + .setOutputCol("embeddings") + .setBatchSize(4) + .setNGpuLayers(99) + ) + + pipeline = Pipeline().setStages([self.document_assembler, model]) + results = pipeline.fit(self.data).transform(self.data) + collected = results.select("embeddings.embeddings").collect() + + for row in collected: + embds = row["embeddings"][0] + assert embds is not None + assert ( + sum(embds) > 0 + ), "Embeddings should not be zero. Was there an error on llama.cpp side?" 
+ + +@pytest.mark.slow +class AutoGGUFEmbeddingsPoolingTypeTestSpec(unittest.TestCase): + def setUp(self): + self.spark = SparkContextForTest.spark + self.data = ( + self.spark.createDataFrame( + [ + ["The moons of Jupiter are "], + ["Earth is "], + ["The moon is "], + ["The sun is "], + ] + ) + .toDF("text") + .repartition(1) + ) + self.document_assembler = ( + DocumentAssembler().setInputCol("text").setOutputCol("document") + ) + + def runTest(self): + model = ( + # AutoGGUFEmbeddings.pretrained() + AutoGGUFEmbeddings.loadSavedModel( + "models/nomic-embed-text-v1.5.Q8_0.gguf", SparkContextForTest.spark + ) + .setInputCols("document") + .setOutputCol("embeddings") + .setBatchSize(4) + .setNGpuLayers(99) + .setPoolingType("CLS") + ) + + pipeline = Pipeline().setStages([self.document_assembler, model]) + results = pipeline.fit(self.data).transform(self.data) + collected = results.select("embeddings.embeddings").collect() + + for row in collected: + embds = row["embeddings"][0] + assert embds is not None + assert ( + sum(embds) > 0 + ), "Embeddings should not be zero. Was there an error on llama.cpp side?" diff --git a/src/main/scala/com/johnsnowlabs/nlp/HasLlamaCppInferenceProperties.scala b/src/main/scala/com/johnsnowlabs/nlp/HasLlamaCppInferenceProperties.scala new file mode 100644 index 00000000000000..e200610b38a2a9 --- /dev/null +++ b/src/main/scala/com/johnsnowlabs/nlp/HasLlamaCppInferenceProperties.scala @@ -0,0 +1,572 @@ +package com.johnsnowlabs.nlp + +import com.johnsnowlabs.nlp.annotators.seq2seq.AutoGGUFModel +import com.johnsnowlabs.nlp.llama.InferenceParameters +import com.johnsnowlabs.nlp.llama.args._ +import com.johnsnowlabs.nlp.serialization.StructFeature +import org.apache.spark.ml.param._ + +import scala.collection.mutable +import scala.jdk.CollectionConverters._ + +/** Contains settable inference parameters for the [[AutoGGUFModel]]. + * + * @groupname param Parameters + * @groupname setParam Parameter setters + * @groupname getParam Parameter getters + * @groupprio setParam 1 + * @groupprio getParam 2 + * @groupprio param 3 + * @groupdesc param + * A list of (hyper-)parameter keys this annotator can take. Users can set and get the + * parameter values through setters and getters, respectively. 
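(Editor's note: to make the role of this trait concrete, here is a short sketch, not part of the source, of how a few of the inference properties defined below are typically chained on an `AutoGGUFModel`; the column names and parameter values are illustrative assumptions.)

```scala
import com.johnsnowlabs.nlp.annotator._

// Illustrative only: chaining a few HasLlamaCppInferenceProperties setters.
val model = AutoGGUFModel
  .pretrained()
  .setInputCols("document")
  .setOutputCol("completions")
  .setNPredict(64)        // number of tokens to predict
  .setTemperature(0.4f)   // sampling temperature
  .setTopK(40)            // top-k sampling
  .setTopP(0.9f)          // top-p sampling
  .setRepeatPenalty(1.1f) // penalty for repeated token sequences
  .setSeed(42)            // RNG seed for reproducibility
```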
+ */ +trait HasLlamaCppInferenceProperties { + this: ParamsAndFeaturesWritable with HasProtectedParams => + + /** @group param */ + val inputPrefix = + new Param[String](this, "inputPrefix", "Set the prompt to start generation with") + + /** @group param */ + val inputSuffix = + new Param[String](this, "inputSuffix", "Set a suffix for infilling") + + /** @group param */ + val cachePrompt = new BooleanParam( + this, + "cachePrompt", + "Whether to remember the prompt to avoid reprocessing it") + + /** @group param */ + val nPredict = new IntParam(this, "nPredict", "Set the number of tokens to predict") + + /** @group param */ + val topK = new IntParam(this, "topK", "Set top-k sampling") + + /** @group param */ + val topP = new FloatParam(this, "topP", "Set top-p sampling") + + /** @group param */ + val minP = new FloatParam(this, "minP", "Set min-p sampling") + + /** @group param */ + val tfsZ = new FloatParam(this, "tfsZ", "Set tail free sampling, parameter z") + + /** @group param */ + val typicalP = new FloatParam(this, "typicalP", "Set locally typical sampling, parameter p") + + /** @group param */ + val temperature = new FloatParam(this, "temperature", "Set the temperature") + + /** @group param */ + val dynamicTemperatureRange = + new FloatParam(this, "dynatempRange", "Set the dynamic temperature range") + + /** @group param */ + val dynamicTemperatureExponent = + new FloatParam(this, "dynatempExponent", "Set the dynamic temperature exponent") + + /** @group param */ + val repeatLastN = + new IntParam(this, "repeatLastN", "Set the last n tokens to consider for penalties") + + /** @group param */ + val repeatPenalty = + new FloatParam(this, "repeatPenalty", "Set the penalty of repeated sequences of tokens") + + /** @group param */ + val frequencyPenalty = + new FloatParam(this, "frequencyPenalty", "Set the repetition alpha frequency penalty") + + /** @group param */ + val presencePenalty = + new FloatParam(this, "presencePenalty", "Set the repetition alpha presence penalty") + + /** @group param */ + val miroStat = new Param[String](this, "miroStat", "Set MiroStat sampling strategies.") + + /** @group param */ + val miroStatTau = + new FloatParam(this, "mirostatTau", "Set the MiroStat target entropy, parameter tau") + + /** @group param */ + val miroStatEta = + new FloatParam(this, "mirostatEta", "Set the MiroStat learning rate, parameter eta") + + /** @group param */ + val penalizeNl = new BooleanParam(this, "penalizeNl", "Whether to penalize newline tokens") + + /** @group param */ + val nKeep = + new IntParam(this, "nKeep", "Set the number of tokens to keep from the initial prompt") + + /** @group param */ + val seed = new IntParam(this, "seed", "Set the RNG seed") + + /** @group param */ + val nProbs = new IntParam( + this, + "nProbs", + "Set the amount top tokens probabilities to output if greater than 0.") + + /** @group param */ + val minKeep = new IntParam( + this, + "minKeep", + "Set the amount of tokens the samplers should return at least (0 = disabled)") + + /** @group param */ + val grammar = + new Param[String](this, "grammar", "Set BNF-like grammar to constrain generations") + + /** @group param */ + val penaltyPrompt = new Param[String]( + this, + "penaltyPrompt", + "Override which part of the prompt is penalized for repetition.") + + /** @group param */ + val ignoreEos = new BooleanParam( + this, + "ignoreEos", + "Set whether to ignore end of stream token and continue generating (implies --logit-bias 2-inf)") + + // Modify the likelihood of tokens appearing in the 
completion by their id. + val tokenIdBias: StructFeature[Map[Int, Float]] = + new StructFeature[Map[Int, Float]](this, "tokenIdBias") + + // Modify the likelihood of tokens appearing in the completion by their string. + /** @group param */ + val tokenBias: StructFeature[Map[String, Float]] = + new StructFeature[Map[String, Float]](this, "tokenBias") + + /** @group param */ + val disableTokenIds = + new IntArrayParam(this, "disableTokenIds", "Set the token ids to disable in the completion") + + /** @group param */ + val stopStrings = new StringArrayParam( + this, + "stopStrings", + "Set strings upon seeing which token generation is stopped") + + /** @group param */ + val samplers = new StringArrayParam( + this, + "samplers", + "Set which samplers to use for token generation in the given order") + + /** @group param */ + val useChatTemplate = new BooleanParam( + this, + "useChatTemplate", + "Set whether or not generate should apply a chat template") + + /** Set the prompt to start generation with + * + * @group setParam + */ + def setInputPrefix(inputPrefix: String): this.type = { set(this.inputPrefix, inputPrefix) } + + /** Set a suffix for infilling + * + * @group setParam + */ + def setInputSuffix(inputSuffix: String): this.type = { set(this.inputSuffix, inputSuffix) } + + /** Whether to remember the prompt to avoid reprocessing it + * + * @group setParam + */ + def setCachePrompt(cachePrompt: Boolean): this.type = { set(this.cachePrompt, cachePrompt) } + + /** Set the number of tokens to predict + * + * @group setParam + */ + def setNPredict(nPredict: Int): this.type = { set(this.nPredict, nPredict) } + + /** Set top-k sampling + * + * @group setParam + */ + def setTopK(topK: Int): this.type = { set(this.topK, topK) } + + /** Set top-p sampling + * + * @group setParam + */ + def setTopP(topP: Float): this.type = { set(this.topP, topP) } + + /** Set min-p sampling + * + * @group setParam + */ + def setMinP(minP: Float): this.type = { set(this.minP, minP) } + + /** Set tail free sampling, parameter z + * @group setParam + */ + def setTfsZ(tfsZ: Float): this.type = { set(this.tfsZ, tfsZ) } + + /** Set locally typical sampling, parameter p + * + * @group setParam + */ + def setTypicalP(typicalP: Float): this.type = { set(this.typicalP, typicalP) } + + /** Set the temperature + * + * @group setParam + */ + def setTemperature(temperature: Float): this.type = { set(this.temperature, temperature) } + + /** Set the dynamic temperature range + * + * @group setParam + */ + def setDynamicTemperatureRange(dynatempRange: Float): this.type = { + set(this.dynamicTemperatureRange, dynatempRange) + } + + /** Set the dynamic temperature exponent + * + * @group setParam + */ + def setDynamicTemperatureExponent(dynatempExponent: Float): this.type = { + set(this.dynamicTemperatureExponent, dynatempExponent) + } + + /** Set the last n tokens to consider for penalties + * + * @group setParam + */ + def setRepeatLastN(repeatLastN: Int): this.type = { set(this.repeatLastN, repeatLastN) } + + /** Set the penalty of repeated sequences of tokens + * + * @group setParam + */ + def setRepeatPenalty(repeatPenalty: Float): this.type = { + set(this.repeatPenalty, repeatPenalty) + } + + /** Set the repetition alpha frequency penalty + * + * @group setParam + */ + def setFrequencyPenalty(frequencyPenalty: Float): this.type = { + set(this.frequencyPenalty, frequencyPenalty) + } + + /** Set the repetition alpha presence penalty + * + * @group setParam + */ + def setPresencePenalty(presencePenalty: Float): this.type = { + 
set(this.presencePenalty, presencePenalty) + } + + /** Set MiroStat sampling strategies. + * + * - DISABLED: No MiroStat + * - V1: MiroStat V1 + * - V2: MiroStat V2 + * + * @group setParam + */ + def setMiroStat(mirostat: String): this.type = set(this.miroStat, mirostat) + + /** Set the MiroStat target entropy, parameter tau + * + * @group setParam + */ + def setMiroStatTau(mirostatTau: Float): this.type = { set(this.miroStatTau, mirostatTau) } + + /** Set the MiroStat learning rate, parameter eta + * + * @group setParam + */ + def setMiroStatEta(mirostatEta: Float): this.type = { set(this.miroStatEta, mirostatEta) } + + /** Set whether to penalize newline tokens + * + * @group setParam + */ + def setPenalizeNl(penalizeNl: Boolean): this.type = { set(this.penalizeNl, penalizeNl) } + + /** Set the number of tokens to keep from the initial prompt + * + * @group setParam + */ + def setNKeep(nKeep: Int): this.type = { set(this.nKeep, nKeep) } + + /** Set the RNG seed + * + * @group setParam + */ + def setSeed(seed: Int): this.type = { set(this.seed, seed) } + + /** Set the amount top tokens probabilities to output if greater than 0. + * + * @group setParam + */ + def setNProbs(nProbs: Int): this.type = { set(this.nProbs, nProbs) } + + /** Set the amount of tokens the samplers should return at least (0 = disabled) + * + * @group setParam + */ + def setMinKeep(minKeep: Int): this.type = { set(this.minKeep, minKeep) } + + /** Set BNF-like grammar to constrain generations + * + * @group setParam + */ + def setGrammar(grammar: String): this.type = { set(this.grammar, grammar) } + + /** Override which part of the prompt is penalized for repetition. + * + * @group setParam + */ + def setPenaltyPrompt(penaltyPrompt: String): this.type = { + set(this.penaltyPrompt, penaltyPrompt) + } + + /** Set whether to ignore end of stream token and continue generating (implies --logit-bias + * 2-inf) + * + * @group setParam + */ + def setIgnoreEos(ignoreEos: Boolean): this.type = { set(this.ignoreEos, ignoreEos) } + + /** Set the tokens to disable during completion. + * + * @group setParam + */ + def setTokenBias(tokenBias: Map[String, Float]): this.type = { + set(this.tokenBias, tokenBias) + } + + /** Set the tokens to disable during completion. (Override for PySpark) + * + * @group setParam + */ + def setTokenBias(tokenBias: java.util.HashMap[String, java.lang.Double]): this.type = { + val scalaTokenBias = tokenBias.asScala.map { case (k, v) => k -> v.floatValue() } + set(this.tokenBias, scalaTokenBias.toMap) + } + + /** Set the token ids to disable in the completion. + * + * @group setParam + */ + def setTokenIdBias(tokenIdBias: Map[Int, Float]): this.type = { + set(this.tokenIdBias, tokenIdBias) + } + + /** Set the token ids to disable in the completion. (Override for PySpark) + * + * @group setParam + */ + def setTokenIdBias(tokenIdBias: java.util.HashMap[Integer, java.lang.Double]): this.type = { + val scalaTokenIdBias = tokenIdBias.asScala.map { case (k, v) => k.toInt -> v.toFloat } + set(this.tokenIdBias, scalaTokenIdBias.toMap) + } + + /** Set the token ids to disable in the completion. This corresponds to `setTokenBias` with a + * value of `Float.NEGATIVE_INFINITY`. 
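+ *
+ * For instance, given some configured [[AutoGGUFModel]] instance `model` (the token ids
+ * below are illustrative), this is presumably equivalent to biasing the same ids with
+ * `setTokenIdBias` and negative infinity:
+ * {{{
+ * model.setDisableTokenIds(Array(2, 13))
+ * // roughly the same effect as:
+ * model.setTokenIdBias(Map(2 -> Float.NegativeInfinity, 13 -> Float.NegativeInfinity))
+ * }}}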
+ * + * @group setParam + */ + def setDisableTokenIds(disableTokenIds: Array[Int]): this.type = { + set(this.disableTokenIds, disableTokenIds) + } + + /** Set strings upon seeing which token generation is stopped + * + * @group setParam + */ + def setStopStrings(stopStrings: Array[String]): this.type = { + set(this.stopStrings, stopStrings) + } + + /** Set which samplers to use for token generation in the given order . + * + * Available Samplers are: + * + * - TOP_K: Top-k sampling + * - TFS_Z: Tail free sampling + * - TYPICAL_P: Locally typical sampling p + * - TOP_P: Top-p sampling + * - MIN_P: Min-p sampling + * - TEMPERATURE: Temperature sampling + * @group setParam + */ + def setSamplers(samplers: Array[String]): this.type = { set(this.samplers, samplers) } + + /** Set whether or not generate should apply a chat template + * + * @group setParam + */ + def setUseChatTemplate(useChatTemplate: Boolean): this.type = { + set(this.useChatTemplate, useChatTemplate) + } + + // ---------------- GETTERS ---------------- + /** @group getParam */ + def getInputPrefix: String = $(inputPrefix) + + /** @group getParam */ + def getInputSuffix: String = $(inputSuffix) + + /** @group getParam */ + def getCachePrompt: Boolean = $(cachePrompt) + + def getNPredict: Int = $(nPredict) + + /** @group getParam */ + def getTopK: Int = $(topK) + + /** @group getParam */ + def getTopP: Float = $(topP) + + /** @group getParam */ + def getMinP: Float = $(minP) + + /** @group getParam */ + def getTfsZ: Float = $(tfsZ) + + /** @group getParam */ + def getTypicalP: Float = $(typicalP) + + /** @group getParam */ + def getTemperature: Float = $(temperature) + + /** @group getParam */ + def getDynamicTemperatureRange: Float = $(dynamicTemperatureRange) + + /** @group getParam */ + def getDynamicTemperatureExponent: Float = $(dynamicTemperatureExponent) + + /** @group getParam */ + def getRepeatLastN: Int = $(repeatLastN) + + /** @group getParam */ + def getRepeatPenalty: Float = $(repeatPenalty) + + /** @group getParam */ + def getFrequencyPenalty: Float = $(frequencyPenalty) + + /** @group getParam */ + def getPresencePenalty: Float = $(presencePenalty) + + /** @group getParam */ + def getMiroStat: String = $(miroStat) + + /** @group getParam */ + def getMiroStatTau: Float = $(miroStatTau) + + /** @group getParam */ + def getMiroStatEta: Float = $(miroStatEta) + + /** @group getParam */ + def getPenalizeNl: Boolean = $(penalizeNl) + + /** @group getParam */ + def getNKeep: Int = $(nKeep) + + /** @group getParam */ + def getSeed: Int = $(seed) + + /** @group getParam */ + def getNProbs: Int = $(nProbs) + + /** @group getParam */ + def getMinKeep: Int = $(minKeep) + + /** @group getParam */ + def getGrammar: String = $(grammar) + + /** @group getParam */ + def getPenaltyPrompt: String = $(penaltyPrompt) + + /** @group getParam */ + def getIgnoreEos: Boolean = $(ignoreEos) + + /** @group getParam */ + def getTokenIdBias: Map[Int, Float] = $$(tokenIdBias) + + /** @group getParam */ + def getTokenBias: Map[String, Float] = $$(tokenBias) + + /** @group getParam */ + def getDisableTokenIds: Array[Int] = $(disableTokenIds) + + /** @group getParam */ + def getStopStrings: Array[String] = $(stopStrings) + + /** @group getParam */ + def getSamplers: Array[String] = $(samplers) + + /** @group getParam */ + def getUseChatTemplate: Boolean = $(useChatTemplate) + + protected def getInferenceParameters: InferenceParameters = { + val inferenceParams = new InferenceParameters("") + if (isDefined(cachePrompt)) 
inferenceParams.setCachePrompt(getCachePrompt) + if (isDefined(disableTokenIds)) { + val javaCollection: java.util.Collection[Integer] = + getDisableTokenIds.map(int2Integer).toSeq.asJava + inferenceParams.disableTokenIds(javaCollection) + } + if (isDefined(dynamicTemperatureExponent)) + inferenceParams.setDynamicTemperatureExponent(getDynamicTemperatureExponent) + if (isDefined(dynamicTemperatureRange)) + inferenceParams.setDynamicTemperatureRange(getDynamicTemperatureRange) + if (isDefined(frequencyPenalty)) inferenceParams.setFrequencyPenalty(getFrequencyPenalty) + if (isDefined(grammar)) inferenceParams.setGrammar(getGrammar) + if (isDefined(ignoreEos)) inferenceParams.setIgnoreEos(getIgnoreEos) + if (isDefined(inputPrefix)) inferenceParams.setInputPrefix(getInputPrefix) + if (isDefined(inputSuffix)) inferenceParams.setInputSuffix(getInputSuffix) + if (isDefined(minKeep)) inferenceParams.setMinKeep(getMinKeep) + if (isDefined(minP)) inferenceParams.setMinP(getMinP) + if (isDefined(miroStat)) inferenceParams.setMiroStat(MiroStat.valueOf(getMiroStat)) + if (isDefined(miroStatEta)) inferenceParams.setMiroStatEta(getMiroStatEta) + if (isDefined(miroStatTau)) inferenceParams.setMiroStatTau(getMiroStatTau) + if (isDefined(nKeep)) inferenceParams.setNKeep(getNKeep) + if (isDefined(nPredict)) inferenceParams.setNPredict(getNPredict) + if (isDefined(nProbs)) inferenceParams.setNProbs(getNProbs) + if (isDefined(penalizeNl)) inferenceParams.setPenalizeNl(getPenalizeNl) + if (isDefined(penaltyPrompt)) inferenceParams.setPenaltyPrompt(getPenaltyPrompt) + if (isDefined(presencePenalty)) inferenceParams.setPresencePenalty(getPresencePenalty) + if (isDefined(repeatLastN)) inferenceParams.setRepeatLastN(getRepeatLastN) + if (isDefined(repeatPenalty)) inferenceParams.setRepeatPenalty(getRepeatPenalty) + if (isDefined(samplers)) inferenceParams.setSamplers(getSamplers.map(Sampler.valueOf): _*) + if (isDefined(seed)) inferenceParams.setSeed(getSeed) + if (isDefined(stopStrings)) inferenceParams.setStopStrings(getStopStrings: _*) + if (isDefined(temperature)) inferenceParams.setTemperature(getTemperature) + if (isDefined(tfsZ)) inferenceParams.setTfsZ(getTfsZ) + if (isDefined(topK)) inferenceParams.setTopK(getTopK) + if (isDefined(topP)) inferenceParams.setTopP(getTopP) + if (isDefined(typicalP)) inferenceParams.setTypicalP(getTypicalP) + if (isDefined(useChatTemplate)) inferenceParams.setUseChatTemplate(getUseChatTemplate) + if (tokenBias.isSet) { + val tokenBiasMap: mutable.Map[String, java.lang.Float] = mutable.Map(getTokenBias.map { + case (key, value) => (key, float2Float(value)) + }.toSeq: _*) + inferenceParams.setTokenBias(tokenBiasMap.asJava) + } + if (tokenIdBias.isSet) { + val tokenIdBiasMap: mutable.Map[Integer, java.lang.Float] = + mutable.Map(getTokenIdBias.map { case (key, value) => + (int2Integer(key), float2Float(value)) + }.toSeq: _*) + inferenceParams.setTokenIdBias(tokenIdBiasMap.asJava) + } + + inferenceParams + } + +} diff --git a/src/main/scala/com/johnsnowlabs/nlp/HasLlamaCppModelProperties.scala b/src/main/scala/com/johnsnowlabs/nlp/HasLlamaCppModelProperties.scala new file mode 100644 index 00000000000000..e71a7b999f25c2 --- /dev/null +++ b/src/main/scala/com/johnsnowlabs/nlp/HasLlamaCppModelProperties.scala @@ -0,0 +1,853 @@ +package com.johnsnowlabs.nlp + +import com.johnsnowlabs.nlp.annotators.seq2seq.AutoGGUFModel +import com.johnsnowlabs.nlp.llama.ModelParameters +import com.johnsnowlabs.nlp.llama.args.{GpuSplitMode, NumaStrategy, PoolingType, RopeScalingType} +import 
com.johnsnowlabs.nlp.serialization.StructFeature +import org.apache.spark.ml.param._ +import org.apache.spark.sql.SparkSession +import org.json4s.DefaultFormats +import org.json4s.jackson.JsonMethods +import org.slf4j.LoggerFactory + +import scala.collection.mutable +import scala.jdk.CollectionConverters._ + +/** Contains settable model parameters for the [[AutoGGUFModel]]. + * + * @groupname param Parameters + * @groupname setParam Parameter setters + * @groupname getParam Parameter getters + * @groupprio setParam 1 + * @groupprio getParam 2 + * @groupprio param 3 + * @groupdesc param + * A list of (hyper-)parameter keys this annotator can take. Users can set and get the + * parameter values through setters and getters, respectively. + */ +trait HasLlamaCppModelProperties { + this: ParamsAndFeaturesWritable with HasProtectedParams => + protected val logger = LoggerFactory.getLogger(this.getClass) + + /** @group param */ + val nThreads = + new IntParam(this, "nThreads", "Set the number of threads to use during generation") + + /** @group param */ + val nThreadsDraft = new IntParam( + this, + "nThreadsDraft", + "Set the number of threads to use during draft generation") + + /** @group param */ + val nThreadsBatch = new IntParam( + this, + "nThreadsBatch", + "Set the number of threads to use during batch and prompt processing") + + /** @group param */ + val nThreadsBatchDraft = new IntParam( + this, + "nThreadsBatchDraft", + "Set the number of threads to use during batch and prompt processing") + + /** @group param */ + val nCtx = new IntParam(this, "nCtx", "Set the size of the prompt context") + + /** @group param */ + val nBatch = new IntParam( + this, + "nBatch", + "Set the logical batch size for prompt processing (must be >=32 to use BLAS)") + + /** @group param */ + val nUbatch = new IntParam( + this, + "nUbatch", + "Set the physical batch size for prompt processing (must be >=32 to use BLAS)") + + /** @group param */ + val nDraft = + new IntParam(this, "nDraft", "Set the number of tokens to draft for speculative decoding") + + /** @group param */ + val nChunks = new IntParam(this, "nChunks", "Set the maximal number of chunks to process") + + /** @group param */ + val nSequences = + new IntParam(this, "nSequences", "Set the number of sequences to decode") + + /** @group param */ + val pSplit = new FloatParam(this, "pSplit", "Set the speculative decoding split probability") + + /** @group param */ + val nGpuLayers = new IntParam( + this, + "nGpuLayers", + "Set the number of layers to store in VRAM (-1 - use default)") + + /** @group param */ + val nGpuLayersDraft = new IntParam( + this, + "nGpuLayersDraft", + "Set the number of layers to store in VRAM for the draft model (-1 - use default)") + + /** Set how to split the model across GPUs + * + * - NONE: No GPU split + * - LAYER: Split the model across GPUs by layer + * - ROW: Split the model across GPUs by rows + * + * @group param + */ + val gpuSplitMode = + new Param[String](this, "gpuSplitMode", "Set how to split the model across GPUs") + + /** @group param */ + val mainGpu = + new IntParam(this, "mainGpu", "Set the main GPU that is used for scratch and small tensors.") + + /** @group param */ + val tensorSplit = new DoubleArrayParam( + this, + "tensorSplit", + "Set how split tensors should be distributed across GPUs") + + /** @group param */ + val grpAttnN = new IntParam(this, "grpAttnN", "Set the group-attention factor") + + /** @group param */ + val grpAttnW = new IntParam(this, "grpAttnW", "Set the group-attention width") + + /** 
@group param */ + val ropeFreqBase = + new FloatParam(this, "ropeFreqBase", "Set the RoPE base frequency, used by NTK-aware scaling") + + /** @group param */ + val ropeFreqScale = new FloatParam( + this, + "ropeFreqScale", + "Set the RoPE frequency scaling factor, expands context by a factor of 1/N") + + /** @group param */ + val yarnExtFactor = + new FloatParam(this, "yarnExtFactor", "Set the YaRN extrapolation mix factor") + + /** @group param */ + val yarnAttnFactor = + new FloatParam(this, "yarnAttnFactor", "Set the YaRN scale sqrt(t) or attention magnitude") + + /** @group param */ + val yarnBetaFast = + new FloatParam(this, "yarnBetaFast", "Set the YaRN low correction dim or beta") + + /** @group param */ + val yarnBetaSlow = + new FloatParam(this, "yarnBetaSlow", "Set the YaRN high correction dim or alpha") + + /** @group param */ + val yarnOrigCtx = + new IntParam(this, "yarnOrigCtx", "Set the YaRN original context size of model") + + /** @group param */ + val defragmentationThreshold = + new FloatParam(this, "defragmentationThreshold", "Set the KV cache defragmentation threshold") + + /** Set optimization strategies that help on some NUMA systems (if available) + * + * Available Strategies: + * + * - DISABLED: No NUMA optimizations + * - DISTRIBUTE: Spread execution evenly over all + * - ISOLATE: Only spawn threads on CPUs on the node that execution started on + * - NUMA_CTL: Use the CPU map provided by numactl + * - MIRROR: Mirrors the model across NUMA nodes + * + * @group param + */ + val numaStrategy = new Param[String]( + this, + "numaStrategy", + "Set optimization strategies that help on some NUMA systems (if available)") + + /** Set the RoPE frequency scaling method, defaults to linear unless specified by the model. + * + * - UNSPECIFIED: Don't use any scaling + * - LINEAR: Linear scaling + * - YARN: YaRN RoPE scaling + * + * @group param + */ + val ropeScalingType = new Param[String]( + this, + "ropeScalingType", + "Set the RoPE frequency scaling method, defaults to linear unless specified by the model") + + /** Set the pooling type for embeddings, use model default if unspecified + * + * - 0 NONE: Don't use any pooling + * - 1 MEAN: Mean Pooling + * - 2 CLS: Choose the CLS token + * - 3 LAST: Choose the last token + * + * @group param + */ + val poolingType = new Param[String]( + this, + "poolingType", + "Set the pooling type for embeddings, use model default if unspecified") + + /** @group param */ + val modelDraft = + new Param[String](this, "modelDraft", "Set the draft model for speculative decoding") + + /** @group param */ + val lookupCacheStaticFilePath = new Param[String]( + this, + "lookupCacheStaticFilePath", + "Set path to static lookup cache to use for lookup decoding (not updated by generation)") + + /** @group param */ + val lookupCacheDynamicFilePath = new Param[String]( + this, + "lookupCacheDynamicFilePath", + "Set path to dynamic lookup cache to use for lookup decoding (updated by generation)") + + /** @group param */ + val loraAdapters = new StructFeature[Map[String, Float]](this, "loraAdapters") + + /** @group param */ + val embedding = + new BooleanParam(this, "embedding", "Whether to load model with embedding support") + + /** @group param */ + val flashAttention = + new BooleanParam(this, "flashAttention", "Whether to enable Flash Attention") + + /** @group param */ + val inputPrefixBos = new BooleanParam( + this, + "inputPrefixBos", + "Whether to add prefix BOS to user inputs, preceding the `--in-prefix` string") + + /** @group param */ + val 
useMmap = new BooleanParam( + this, + "useMmap", + "Whether to use memory-map model (faster load but may increase pageouts if not using mlock)") + + /** @group param */ + val useMlock = new BooleanParam( + this, + "useMlock", + "Whether to force the system to keep model in RAM rather than swapping or compressing") + + /** @group param */ + val noKvOffload = new BooleanParam(this, "noKvOffload", "Whether to disable KV offload") + + /** @group param */ + val systemPrompt = new Param[String](this, "systemPrompt", "Set a system prompt to use") + + /** @group param */ + val chatTemplate = + new Param[String](this, "chatTemplate", "The chat template to use") + + private def checkEmbeddingMode(setter: => this.type): this.type = { + if (getEmbedding) { + logger.warn("Embeddings enabled. This parameter has no effect.") + this + } else + setter + } + + /** Set the number of threads to use during generation + * + * @group setParam + */ + def setNThreads(nThreads: Int): this.type = { + set(this.nThreads, nThreads) + } + + /** Set the number of threads to use during draft generation + * + * @group setParam + */ + def setNThreadsDraft(nThreadsDraft: Int): this.type = { + checkEmbeddingMode { set(this.nThreadsDraft, nThreadsDraft) } + } + + /** Set the number of threads to use during batch and prompt processing + * + * @group setParam + */ + def setNThreadsBatch(nThreadsBatch: Int): this.type = { + checkEmbeddingMode { set(this.nThreadsBatch, nThreadsBatch) } + } + + /** Set the number of threads to use during batch and prompt processing + * + * @group setParam + */ + def setNThreadsBatchDraft(nThreadsBatchDraft: Int): this.type = { + checkEmbeddingMode { set(this.nThreadsBatchDraft, nThreadsBatchDraft) } + } + + /** Set the size of the prompt context + * + * @group setParam + */ + def setNCtx(nCtx: Int): this.type = { + set(this.nCtx, nCtx) + } + + /** Set the logical batch size for prompt processing (must be >=32 to use BLAS) + * + * @group setParam + */ + def setNBatch(nBatch: Int): this.type = { + set(this.nBatch, nBatch) + } + + /** Set the physical batch size for prompt processing (must be >=32 to use BLAS) + * + * @group setParam + */ + def setNUbatch(nUbatch: Int): this.type = { + set(this.nUbatch, nUbatch) + } + + /** Set the number of tokens to draft for speculative decoding + * + * @group setParam + */ + def setNDraft(nDraft: Int): this.type = { + checkEmbeddingMode { set(this.nDraft, nDraft) } + } + + /** Set the maximal number of chunks to process + * + * @group setParam + */ + def setNChunks(nChunks: Int): this.type = { + set(this.nChunks, nChunks) + } + + /** Set the number of sequences to decode + * + * @group setParam + */ + def setNSequences(nSequences: Int): this.type = { + set(this.nSequences, nSequences) + } + + /** Set the speculative decoding split probability + * + * @group setParam + */ + def setPSplit(pSplit: Float): this.type = { + checkEmbeddingMode { set(this.pSplit, pSplit) } + } + + /** Set the number of layers to store in VRAM (-1 - use default) + * + * @group setParam + */ + def setNGpuLayers(nGpuLayers: Int): this.type = { + set(this.nGpuLayers, nGpuLayers) + } + + /** Set the number of layers to store in VRAM for the draft model (-1 - use default) + * + * @group setParam + */ + def setNGpuLayersDraft(nGpuLayersDraft: Int): this.type = { + checkEmbeddingMode { set(this.nGpuLayersDraft, nGpuLayersDraft) } + } + + /** Set how to split the model across GPUs + * + * - NONE: No GPU split + * -LAYER: Split the model across GPUs by layer 2. 
ROW: Split the model across GPUs by rows + * + * @group setParam + */ + def setGpuSplitMode(splitMode: String): this.type = { + set(this.gpuSplitMode, splitMode) + } + + /** Set the GPU that is used for scratch and small tensors + * + * @group setParam + */ + def setMainGpu(mainGpu: Int): this.type = { + set(this.mainGpu, mainGpu) + } + + /** Set how split tensors should be distributed across GPUs + * + * @group setParam + */ + def setTensorSplit(tensorSplit: Array[Double]): this.type = { + set(this.tensorSplit, tensorSplit) + } + + /** Set the group-attention factor + * + * @group setParam + */ + def setGrpAttnN(grpAttnN: Int): this.type = { + set(this.grpAttnN, grpAttnN) + } + + /** Set the group-attention width + * + * @group setParam + */ + def setGrpAttnW(grpAttnW: Int): this.type = { + set(this.grpAttnW, grpAttnW) + } + + /** Set the RoPE base frequency, used by NTK-aware scaling + * + * @group setParam + */ + def setRopeFreqBase(ropeFreqBase: Float): this.type = { + set(this.ropeFreqBase, ropeFreqBase) + } + + /** Set the RoPE frequency scaling factor, expands context by a factor of 1/N + * + * @group setParam + */ + def setRopeFreqScale(ropeFreqScale: Float): this.type = { + set(this.ropeFreqScale, ropeFreqScale) + } + + /** Set the YaRN extrapolation mix factor + * + * @group setParam + */ + def setYarnExtFactor(yarnExtFactor: Float): this.type = { + set(this.yarnExtFactor, yarnExtFactor) + } + + /** Set the YaRN scale sqrt(t) or attention magnitude + * + * @group setParam + */ + def setYarnAttnFactor(yarnAttnFactor: Float): this.type = { + set(this.yarnAttnFactor, yarnAttnFactor) + } + + /** Set the YaRN low correction dim or beta + * + * @group setParam + */ + def setYarnBetaFast(yarnBetaFast: Float): this.type = { + set(this.yarnBetaFast, yarnBetaFast) + } + + /** Set the YaRN high correction dim or alpha + * + * @group setParam + */ + def setYarnBetaSlow(yarnBetaSlow: Float): this.type = { + set(this.yarnBetaSlow, yarnBetaSlow) + } + + /** Set the YaRN original context size of model + * + * @group setParam + */ + def setYarnOrigCtx(yarnOrigCtx: Int): this.type = { + set(this.yarnOrigCtx, yarnOrigCtx) + } + + /** Set the KV cache defragmentation threshold + * + * @group setParam + */ + def setDefragmentationThreshold(defragThold: Float): this.type = { + set(this.defragmentationThreshold, defragThold) + } + + /** Set optimization strategies that help on some NUMA systems (if available) + * + * Available Strategies: + * + * - DISABLED: No NUMA optimizations + * - DISTRIBUTE: spread execution evenly over all + * - ISOLATE: only spawn threads on CPUs on the node that execution started on + * - NUMA_CTL: use the CPU map provided by numactl + * - MIRROR: Mirrors the model across NUMA nodes + * + * @group setParam + */ + def setNumaStrategy(numa: String): this.type = { + val numaUpper = numa.toUpperCase + val numaStrategies = Array("DISABLED", "DISTRIBUTE", "ISOLATE", "NUMA_CTL", "MIRROR") + require( + numaStrategies.contains(numaUpper), + s"Invalid NUMA strategy: $numa. " + + s"Valid values are: ${numaStrategies.mkString(", ")}") + set(this.numaStrategy, numaUpper) + } + + /** Set the RoPE frequency scaling method, defaults to linear unless specified by the model. 
+ * + * - UNSPECIFIED: Don't use any scaling + * - LINEAR: Linear scaling + * - YARN: YaRN RoPE scaling + * + * @group setParam + */ + def setRopeScalingType(ropeScalingType: String): this.type = { + set(this.ropeScalingType, ropeScalingType) + } + + /** Set the pooling type for embeddings, use model default if unspecified + * + * - 0 NONE: Don't use any pooling and return token embeddings (if the model supports it) + * - 1 MEAN: Mean Pooling + * - 2 CLS: Choose the CLS token + * - 3 LAST: Choose the last token + * + * @group setParam + */ + def setPoolingType(poolingType: String): this.type = { + val poolingTypeUpper = poolingType.toUpperCase + val poolingTypes = Array("NONE", "MEAN", "CLS", "LAST") + require( + poolingTypes.contains(poolingTypeUpper), + s"Invalid pooling type: $poolingType. " + + s"Valid values are: ${poolingTypes.mkString(", ")}") + set(this.poolingType, poolingTypeUpper) + } + + /** Set the draft model for speculative decoding + * + * @group setParam + */ + def setModelDraft(modelDraft: String): this.type = { + checkEmbeddingMode { set(this.modelDraft, modelDraft) } + } + + /** Set path to static lookup cache to use for lookup decoding (not updated by generation) + * + * @group setParam + */ + def setLookupCacheStaticFilePath(lookupCacheStaticFilePath: String): this.type = { + checkEmbeddingMode { set(this.lookupCacheStaticFilePath, lookupCacheStaticFilePath) } + } + + /** Set path to dynamic lookup cache to use for lookup decoding (updated by generation) + * + * @group setParam + */ + def setLookupCacheDynamicFilePath(lookupCacheDynamicFilePath: String): this.type = { + checkEmbeddingMode { set(this.lookupCacheDynamicFilePath, lookupCacheDynamicFilePath) } + } + + /** Sets paths to lora adapters with user defined scale. + * + * @group setParam + */ + def setLoraAdapters(loraAdapters: Map[String, Float]): this.type = { + set(this.loraAdapters, loraAdapters) + } + + /** Sets paths to lora adapters with user defined scale. 
(PySpark Override) + * + * @group setParam + */ + def setLoraAdapters(loraAdapters: java.util.HashMap[String, java.lang.Double]): this.type = { + val scalaLoraAdapters = loraAdapters.asScala.map { case (k, v) => k -> v.floatValue() } + set(this.loraAdapters, scalaLoraAdapters.toMap) + } + + /** Whether to load model with embedding support + * + * @group setParam + */ + def setEmbedding(embedding: Boolean): this.type = { + set(this.embedding, embedding) + } + + /** Whether to enable Flash Attention + * + * @group setParam + */ + def setFlashAttention(flashAttention: Boolean): this.type = { + set(this.flashAttention, flashAttention) + } + + /** Whether to add prefix BOS to user inputs, preceding the `--in-prefix` string + * + * @group setParam + */ + def setInputPrefixBos(inputPrefixBos: Boolean): this.type = { + set(this.inputPrefixBos, inputPrefixBos) + } + + /** Whether to use memory-map model (faster load but may increase pageouts if not using mlock) + * + * @group setParam + */ + def setUseMmap(useMmap: Boolean): this.type = { + set(this.useMmap, useMmap) + } + + /** Whether to force the system to keep model in RAM rather than swapping or compressing + * + * @group setParam + */ + def setUseMlock(useMlock: Boolean): this.type = { + set(this.useMlock, useMlock) + } + + /** Whether to disable KV offload + * + * @group setParam + */ + def setNoKvOffload(noKvOffload: Boolean): this.type = { + set(this.noKvOffload, noKvOffload) + } + + /** Set a system prompt to use + * + * @group setParam + */ + def setSystemPrompt(systemPrompt: String): this.type = { + checkEmbeddingMode { set(this.systemPrompt, systemPrompt) } + } + + /** The chat template to use + * + * @group setParam + */ + def setChatTemplate(chatTemplate: String): this.type = { + checkEmbeddingMode { set(this.chatTemplate, chatTemplate) } + } + + /** @group getParam */ + def getNThreads: Int = $(nThreads) + + /** @group getParam */ + def getNThreadsDraft: Int = $(nThreadsDraft) + + /** @group getParam */ + def getNThreadsBatch: Int = $(nThreadsBatch) + + /** @group getParam */ + def getNThreadsBatchDraft: Int = $(nThreadsBatchDraft) + + /** @group getParam */ + def getNCtx: Int = $(nCtx) + + /** @group getParam */ + def getNBatch: Int = $(nBatch) + + /** @group getParam */ + def getNUbatch: Int = $(nUbatch) + + /** @group getParam */ + def getNDraft: Int = $(nDraft) + + /** @group getParam */ + def getNChunks: Int = $(nChunks) + + /** @group getParam */ + def getNSequences: Int = $(nSequences) + + /** @group getParam */ + def getPSplit: Float = $(pSplit) + + /** @group getParam */ + def getNGpuLayers: Int = $(nGpuLayers) + + /** @group getParam */ + def getNGpuLayersDraft: Int = $(nGpuLayersDraft) + + /** @group getParam */ + def getSplitMode: String = $(gpuSplitMode) + + /** @group getParam */ + def getMainGpu: Int = $(mainGpu) + + /** @group getParam */ + def getTensorSplit: Array[Double] = $(tensorSplit) + + def getGrpAttnN: Int = $(grpAttnN) + + /** @group getParam */ + def getGrpAttnW: Int = $(grpAttnW) + + /** @group getParam */ + def getRopeFreqBase: Float = $(ropeFreqBase) + + /** @group getParam */ + def getRopeFreqScale: Float = $(ropeFreqScale) + + /** @group getParam */ + def getYarnExtFactor: Float = $(yarnExtFactor) + + /** @group getParam */ + def getYarnAttnFactor: Float = $(yarnAttnFactor) + + /** @group getParam */ + def getYarnBetaFast: Float = $(yarnBetaFast) + + /** @group getParam */ + def getYarnBetaSlow: Float = $(yarnBetaSlow) + + /** @group getParam */ + def getYarnOrigCtx: Int = $(yarnOrigCtx) + + /** 
@group getParam */ + def getDefragmentationThreshold: Float = $(defragmentationThreshold) + + /** @group getParam */ + def getNuma: String = $(numaStrategy) + + /** @group getParam */ + def getRopeScalingType: String = $(ropeScalingType) + + /** @group getParam */ + def getPoolingType: String = $(poolingType) + + /** @group getParam */ + def getModelDraft: String = $(modelDraft) + + /** @group getParam */ + def getLookupCacheStaticFilePath: String = $(lookupCacheStaticFilePath) + + /** @group getParam */ + def getLookupCacheDynamicFilePath: String = $(lookupCacheDynamicFilePath) + + /** @group getParam */ + def getLoraAdapters: Map[String, Float] = $$(loraAdapters) + + /** @group getParam */ + def getEmbedding: Boolean = $(embedding) + + /** @group getParam */ + def getFlashAttention: Boolean = $(flashAttention) + + /** @group getParam */ + def getInputPrefixBos: Boolean = $(inputPrefixBos) + + /** @group getParam */ + def getUseMmap: Boolean = $(useMmap) + + /** @group getParam */ + def getUseMlock: Boolean = $(useMlock) + + /** @group getParam */ + def getNoKvOffload: Boolean = $(noKvOffload) + + /** @group getParam */ + def getSystemPrompt: String = $(systemPrompt) + + /** @group getParam */ + def getChatTemplate: String = $(chatTemplate) + + // ---------------- METADATA ---------------- + val metadata = + new Param[String](this, "metadata", "Set the metadata for the model").setProtected() + + /** Set the metadata for the model + * @group setParam + */ + def setMetadata(metadata: String): this.type = { set(this.metadata, metadata) } + + /** Get the metadata for the model + * @group getParam + */ + def getMetadata: String = $(metadata) + + def getMetadataMap: Map[String, String] = { + val metadataJsonString = getMetadata + if (metadataJsonString.isEmpty) Map.empty + else { + implicit val formats: DefaultFormats.type = DefaultFormats + JsonMethods.parse(metadataJsonString).extract[Map[String, String]] + } + } + + protected def getModelParameters: ModelParameters = { + val modelParameters = new ModelParameters().setContinuousBatching(true) // Always enabled + + if (isDefined(chatTemplate)) modelParameters.setChatTemplate(getChatTemplate) + if (isDefined(defragmentationThreshold)) + modelParameters.setDefragmentationThreshold(getDefragmentationThreshold) + if (isDefined(embedding)) modelParameters.setEmbedding(getEmbedding) + if (isDefined(flashAttention)) modelParameters.setFlashAttention(getFlashAttention) + if (isDefined(gpuSplitMode)) + modelParameters.setSplitMode(GpuSplitMode.valueOf(getSplitMode)) + if (isDefined(grpAttnN)) modelParameters.setGrpAttnN(getGrpAttnN) + if (isDefined(grpAttnW)) modelParameters.setGrpAttnN(getGrpAttnW) + if (isDefined(inputPrefixBos)) modelParameters.setInputPrefixBos(getInputPrefixBos) + if (isDefined(lookupCacheDynamicFilePath)) + modelParameters.setLookupCacheDynamicFilePath(getLookupCacheDynamicFilePath) + if (isDefined(lookupCacheStaticFilePath)) + modelParameters.setLookupCacheStaticFilePath(getLookupCacheStaticFilePath) + if (isDefined(mainGpu)) modelParameters.setMainGpu(getMainGpu) + if (isDefined(modelDraft)) modelParameters.setModelDraft(getModelDraft) + if (isDefined(nBatch)) modelParameters.setNBatch(getNBatch) + if (isDefined(nChunks)) modelParameters.setNChunks(getNChunks) + if (isDefined(nCtx)) modelParameters.setNCtx(getNCtx) + if (isDefined(nDraft)) modelParameters.setNDraft(getNDraft) + if (isDefined(nGpuLayers)) modelParameters.setNGpuLayers(getNGpuLayers) + if (isDefined(nGpuLayersDraft)) 
modelParameters.setNGpuLayersDraft(getNGpuLayersDraft) + if (isDefined(nSequences)) modelParameters.setNSequences(getNSequences) + if (isDefined(nThreads)) modelParameters.setNThreads(getNThreads) + if (isDefined(nThreadsBatch)) modelParameters.setNThreadsBatch(getNThreadsBatch) + if (isDefined(nThreadsBatchDraft)) + modelParameters.setNThreadsBatchDraft(getNThreadsBatchDraft) + if (isDefined(nThreadsDraft)) modelParameters.setNThreadsDraft(getNThreadsDraft) + if (isDefined(nUbatch)) modelParameters.setNUbatch(getNUbatch) + if (isDefined(noKvOffload)) modelParameters.setNoKvOffload(getNoKvOffload) + if (isDefined(numaStrategy)) modelParameters.setNuma(NumaStrategy.valueOf(getNuma)) + if (isDefined(pSplit)) modelParameters.setPSplit(getPSplit) + if (isDefined(poolingType)) + modelParameters.setPoolingType(PoolingType.valueOf(getPoolingType)) + if (isDefined(ropeFreqBase)) modelParameters.setRopeFreqBase(getRopeFreqBase) + if (isDefined(ropeFreqScale)) modelParameters.setRopeFreqScale(getRopeFreqScale) + if (isDefined(ropeScalingType)) + modelParameters.setRopeScalingType(RopeScalingType.valueOf(getRopeScalingType)) + if (isDefined(systemPrompt)) modelParameters.setSystemPrompt(getSystemPrompt) + if (isDefined(tensorSplit)) modelParameters.setTensorSplit(getTensorSplit.map(_.toFloat)) + if (isDefined(useMlock)) modelParameters.setUseMlock(getUseMlock) + if (isDefined(useMmap)) modelParameters.setUseMmap(getUseMmap) + if (isDefined(yarnAttnFactor)) modelParameters.setYarnAttnFactor(getYarnAttnFactor) + if (isDefined(yarnBetaFast)) modelParameters.setYarnBetaFast(getYarnBetaFast) + if (isDefined(yarnBetaSlow)) modelParameters.setYarnBetaSlow(getYarnBetaSlow) + if (isDefined(yarnExtFactor)) modelParameters.setYarnExtFactor(getYarnExtFactor) + if (isDefined(yarnOrigCtx)) modelParameters.setYarnOrigCtx(getYarnOrigCtx) + if (loraAdapters.isSet) { + val loraAdaptersMap: mutable.Map[String, java.lang.Float] = + mutable.Map(getLoraAdapters.map { case (key, value) => + (key, float2Float(value)) + }.toSeq: _*) + modelParameters.setLoraAdapters(loraAdaptersMap.asJava) + } // Need to convert to mutable map first + + modelParameters + } + + // ---------------- GPU SUPPORT ---------------- + // Values for automatic GPU support + protected val defaultGpuLayers = 1000 + protected val defaultMainGpu = 0 + + // Entrypoint for models. Automatically set GPU support if detected. + protected def setGpuSupportIfAvailable(spark: SparkSession): this.type = { + val usingGPUJar: Boolean = spark.sparkContext.listJars.exists(_.contains("spark-nlp-gpu")) + if (usingGPUJar) { + logger.info("Using GPU jar. Offloading all layers to GPU.") + setMainGpu(defaultMainGpu) + setNGpuLayers(defaultGpuLayers) + } + this + } +} diff --git a/src/main/scala/com/johnsnowlabs/nlp/HasLlamaCppProperties.scala b/src/main/scala/com/johnsnowlabs/nlp/HasLlamaCppProperties.scala deleted file mode 100644 index e6d832eef9a79f..00000000000000 --- a/src/main/scala/com/johnsnowlabs/nlp/HasLlamaCppProperties.scala +++ /dev/null @@ -1,1292 +0,0 @@ -package com.johnsnowlabs.nlp - -import com.johnsnowlabs.nlp.annotators.seq2seq.AutoGGUFModel -import com.johnsnowlabs.nlp.llama.args._ -import com.johnsnowlabs.nlp.llama.{InferenceParameters, ModelParameters} -import com.johnsnowlabs.nlp.serialization.StructFeature -import org.apache.spark.ml.param._ -import org.slf4j.LoggerFactory - -import scala.collection.mutable -import scala.jdk.CollectionConverters._ - -/** Contains settable parameters for the [[AutoGGUFModel]]. 
- * - * @groupname param Parameters - * @groupname setParam Parameter setters - * @groupname getParam Parameter getters - * @groupprio setParam 1 - * @groupprio getParam 2 - * @groupprio param 3 - * @groupdesc param - * A list of (hyper-)parameter keys this annotator can take. Users can set and get the - * parameter values through setters and getters, respectively. - */ -trait HasLlamaCppProperties { - this: ParamsAndFeaturesWritable with HasProtectedParams => - val logger = LoggerFactory.getLogger(this.getClass) - // ---------------- MODEL PARAMETERS ---------------- - /** @group param */ - val nThreads = - new IntParam(this, "nThreads", "Set the number of threads to use during generation") - - /** @group param */ - val nThreadsDraft = new IntParam( - this, - "nThreadsDraft", - "Set the number of threads to use during draft generation") - - /** @group param */ - val nThreadsBatch = new IntParam( - this, - "nThreadsBatch", - "Set the number of threads to use during batch and prompt processing") - - /** @group param */ - val nThreadsBatchDraft = new IntParam( - this, - "nThreadsBatchDraft", - "Set the number of threads to use during batch and prompt processing") - - /** @group param */ - val nCtx = new IntParam(this, "nCtx", "Set the size of the prompt context") - - /** @group param */ - val nBatch = new IntParam( - this, - "nBatch", - "Set the logical batch size for prompt processing (must be >=32 to use BLAS)") - - /** @group param */ - val nUbatch = new IntParam( - this, - "nUbatch", - "Set the physical batch size for prompt processing (must be >=32 to use BLAS)") - - /** @group param */ - val nDraft = - new IntParam(this, "nDraft", "Set the number of tokens to draft for speculative decoding") - - /** @group param */ - val nChunks = new IntParam(this, "nChunks", "Set the maximal number of chunks to process") - - /** @group param */ - val nSequences = - new IntParam(this, "nSequences", "Set the number of sequences to decode") - - /** @group param */ - val pSplit = new FloatParam(this, "pSplit", "Set the speculative decoding split probability") - - /** @group param */ - val nGpuLayers = new IntParam( - this, - "nGpuLayers", - "Set the number of layers to store in VRAM (-1 - use default)") - - /** @group param */ - val nGpuLayersDraft = new IntParam( - this, - "nGpuLayersDraft", - "Set the number of layers to store in VRAM for the draft model (-1 - use default)") - - /** Set how to split the model across GPUs - * - * - NONE: No GPU split - * - LAYER: Split the model across GPUs by layer - * - ROW: Split the model across GPUs by rows - * - * @group param - */ - val gpuSplitMode = - new Param[String](this, "gpuSplitMode", "Set how to split the model across GPUs") - - /** @group param */ - val mainGpu = - new IntParam(this, "mainGpu", "Set the main GPU that is used for scratch and small tensors.") - - /** @group param */ - val tensorSplit = new DoubleArrayParam( - this, - "tensorSplit", - "Set how split tensors should be distributed across GPUs") - - /** @group param */ - val grpAttnN = new IntParam(this, "grpAttnN", "Set the group-attention factor") - - /** @group param */ - val grpAttnW = new IntParam(this, "grpAttnW", "Set the group-attention width") - - /** @group param */ - val ropeFreqBase = - new FloatParam(this, "ropeFreqBase", "Set the RoPE base frequency, used by NTK-aware scaling") - - /** @group param */ - val ropeFreqScale = new FloatParam( - this, - "ropeFreqScale", - "Set the RoPE frequency scaling factor, expands context by a factor of 1/N") - - /** @group param */ - val 
yarnExtFactor = - new FloatParam(this, "yarnExtFactor", "Set the YaRN extrapolation mix factor") - - /** @group param */ - val yarnAttnFactor = - new FloatParam(this, "yarnAttnFactor", "Set the YaRN scale sqrt(t) or attention magnitude") - - /** @group param */ - val yarnBetaFast = - new FloatParam(this, "yarnBetaFast", "Set the YaRN low correction dim or beta") - - /** @group param */ - val yarnBetaSlow = - new FloatParam(this, "yarnBetaSlow", "Set the YaRN high correction dim or alpha") - - /** @group param */ - val yarnOrigCtx = - new IntParam(this, "yarnOrigCtx", "Set the YaRN original context size of model") - - /** @group param */ - val defragmentationThreshold = - new FloatParam(this, "defragmentationThreshold", "Set the KV cache defragmentation threshold") - - /** Set optimization strategies that help on some NUMA systems (if available) - * - * Available Strategies: - * - * - DISABLED: No NUMA optimizations - * - DISTRIBUTE: Spread execution evenly over all - * - ISOLATE: Only spawn threads on CPUs on the node that execution started on - * - NUMA_CTL: Use the CPU map provided by numactl - * - MIRROR: Mirrors the model across NUMA nodes - * - * @group param - */ - val numaStrategy = new Param[String]( - this, - "numaStrategy", - "Set optimization strategies that help on some NUMA systems (if available)") - - /** Set the RoPE frequency scaling method, defaults to linear unless specified by the model. - * - * - UNSPECIFIED: Don't use any scaling - * - LINEAR: Linear scaling - * - YARN: YaRN RoPE scaling - * @group param - */ - val ropeScalingType = new Param[String]( - this, - "ropeScalingType", - "Set the RoPE frequency scaling method, defaults to linear unless specified by the model") - - /** Set the pooling type for embeddings, use model default if unspecified - * - * - 0 UNSPECIFIED: Don't use any pooling - * - 1 MEAN: Mean Pooling - * - 2 CLS: CLS Pooling - * - * @group param - */ - val poolingType = new Param[String]( - this, - "poolingType", - "Set the pooling type for embeddings, use model default if unspecified") - // model = new Param[String](this, "model", "Set the model file path to load") - /** @group param */ - val modelDraft = - new Param[String](this, "modelDraft", "Set the draft model for speculative decoding") - - // modelAlias = new Param[String](this, "modelAlias", "Set a model alias") - /** @group param */ - val lookupCacheStaticFilePath = new Param[String]( - this, - "lookupCacheStaticFilePath", - "Set path to static lookup cache to use for lookup decoding (not updated by generation)") - - /** @group param */ - val lookupCacheDynamicFilePath = new Param[String]( - this, - "lookupCacheDynamicFilePath", - "Set path to dynamic lookup cache to use for lookup decoding (updated by generation)") - - /** @group param */ - val loraAdapters = new StructFeature[Map[String, Float]](this, "loraAdapters") - - val embedding = - new BooleanParam(this, "embedding", "Whether to load model with embedding support") - - /** @group param */ - val flashAttention = - new BooleanParam(this, "flashAttention", "Whether to enable Flash Attention") - - /** @group param */ - val inputPrefixBos = new BooleanParam( - this, - "inputPrefixBos", - "Whether to add prefix BOS to user inputs, preceding the `--in-prefix` string") - - /** @group param */ - val useMmap = new BooleanParam( - this, - "useMmap", - "Whether to use memory-map model (faster load but may increase pageouts if not using mlock)") - - /** @group param */ - val useMlock = new BooleanParam( - this, - "useMlock", - "Whether to force 
the system to keep model in RAM rather than swapping or compressing") - - /** @group param */ - val noKvOffload = new BooleanParam(this, "noKvOffload", "Whether to disable KV offload") - - /** @group param */ - val systemPrompt = new Param[String](this, "systemPrompt", "Set a system prompt to use") - - /** @group param */ - val chatTemplate = - new Param[String](this, "chatTemplate", "The chat template to use") - - /** Set the number of threads to use during generation - * - * @group setParam - */ - def setNThreads(nThreads: Int): this.type = { set(this.nThreads, nThreads) } - - /** Set the number of threads to use during draft generation - * - * @group setParam - */ - def setNThreadsDraft(nThreadsDraft: Int): this.type = { set(this.nThreadsDraft, nThreadsDraft) } - - /** Set the number of threads to use during batch and prompt processing - * - * @group setParam - */ - def setNThreadsBatch(nThreadsBatch: Int): this.type = { set(this.nThreadsBatch, nThreadsBatch) } - - /** Set the number of threads to use during batch and prompt processing - * - * @group setParam - */ - def setNThreadsBatchDraft(nThreadsBatchDraft: Int): this.type = { - set(this.nThreadsBatchDraft, nThreadsBatchDraft) - } - - /** Set the size of the prompt context - * - * @group setParam - */ - def setNCtx(nCtx: Int): this.type = { set(this.nCtx, nCtx) } - - /** Set the logical batch size for prompt processing (must be >=32 to use BLAS) - * - * @group setParam - */ - def setNBatch(nBatch: Int): this.type = { set(this.nBatch, nBatch) } - - /** Set the physical batch size for prompt processing (must be >=32 to use BLAS) - * - * @group setParam - */ - def setNUbatch(nUbatch: Int): this.type = { set(this.nUbatch, nUbatch) } - - /** Set the number of tokens to draft for speculative decoding - * - * @group setParam - */ - def setNDraft(nDraft: Int): this.type = { set(this.nDraft, nDraft) } - - /** Set the maximal number of chunks to process - * - * @group setParam - */ - def setNChunks(nChunks: Int): this.type = { set(this.nChunks, nChunks) } - - /** Set the number of sequences to decode - * - * @group setParam - */ - def setNSequences(nSequences: Int): this.type = { set(this.nSequences, nSequences) } - - /** Set the speculative decoding split probability - * - * @group setParam - */ - def setPSplit(pSplit: Float): this.type = { set(this.pSplit, pSplit) } - - /** Set the number of layers to store in VRAM (-1 - use default) - * - * @group setParam - */ - def setNGpuLayers(nGpuLayers: Int): this.type = { set(this.nGpuLayers, nGpuLayers) } - - /** Set the number of layers to store in VRAM for the draft model (-1 - use default) - * - * @group setParam - */ - def setNGpuLayersDraft(nGpuLayersDraft: Int): this.type = { - set(this.nGpuLayersDraft, nGpuLayersDraft) - } - - /** Set how to split the model across GPUs - * - * - NONE: No GPU split - * -LAYER: Split the model across GPUs by layer 2. 
ROW: Split the model across GPUs by rows - * - * @group setParam - */ - def setGpuSplitMode(splitMode: String): this.type = { set(this.gpuSplitMode, splitMode) } - - /** Set the GPU that is used for scratch and small tensors - * - * @group setParam - */ - def setMainGpu(mainGpu: Int): this.type = { set(this.mainGpu, mainGpu) } - - /** Set how split tensors should be distributed across GPUs - * - * @group setParam - */ - def setTensorSplit(tensorSplit: Array[Double]): this.type = { - set(this.tensorSplit, tensorSplit) - } - - /** Set the group-attention factor - * - * @group setParam - */ - def setGrpAttnN(grpAttnN: Int): this.type = { set(this.grpAttnN, grpAttnN) } - - /** Set the group-attention width - * - * @group setParam - */ - def setGrpAttnW(grpAttnW: Int): this.type = { set(this.grpAttnW, grpAttnW) } - - /** Set the RoPE base frequency, used by NTK-aware scaling - * - * @group setParam - */ - def setRopeFreqBase(ropeFreqBase: Float): this.type = { set(this.ropeFreqBase, ropeFreqBase) } - - /** Set the RoPE frequency scaling factor, expands context by a factor of 1/N - * - * @group setParam - */ - def setRopeFreqScale(ropeFreqScale: Float): this.type = { - set(this.ropeFreqScale, ropeFreqScale) - } - - /** Set the YaRN extrapolation mix factor - * - * @group setParam - */ - def setYarnExtFactor(yarnExtFactor: Float): this.type = { - set(this.yarnExtFactor, yarnExtFactor) - } - - /** Set the YaRN scale sqrt(t) or attention magnitude - * - * @group setParam - */ - def setYarnAttnFactor(yarnAttnFactor: Float): this.type = { - set(this.yarnAttnFactor, yarnAttnFactor) - } - - /** Set the YaRN low correction dim or beta - * - * @group setParam - */ - def setYarnBetaFast(yarnBetaFast: Float): this.type = { set(this.yarnBetaFast, yarnBetaFast) } - - /** Set the YaRN high correction dim or alpha - * - * @group setParam - */ - def setYarnBetaSlow(yarnBetaSlow: Float): this.type = { set(this.yarnBetaSlow, yarnBetaSlow) } - - /** Set the YaRN original context size of model - * - * @group setParam - */ - def setYarnOrigCtx(yarnOrigCtx: Int): this.type = { set(this.yarnOrigCtx, yarnOrigCtx) } - - /** Set the KV cache defragmentation threshold - * - * @group setParam - */ - def setDefragmentationThreshold(defragThold: Float): this.type = { - set(this.defragmentationThreshold, defragThold) - } - - /** Set optimization strategies that help on some NUMA systems (if available) - * - * Available Strategies: - * - * - DISABLED: No NUMA optimizations - * - DISTRIBUTE: spread execution evenly over all - * - ISOLATE: only spawn threads on CPUs on the node that execution started on - * - NUMA_CTL: use the CPU map provided by numactl - * - MIRROR: Mirrors the model across NUMA nodes - * - * @group setParam - */ - def setNumaStrategy(numa: String): this.type = { set(this.numaStrategy, numa) } - - /** Set the RoPE frequency scaling method, defaults to linear unless specified by the model. 
- * - * - UNSPECIFIED: Don't use any scaling - * - LINEAR: Linear scaling - * - YARN: YaRN RoPE scaling - * @group setParam - */ - def setRopeScalingType(ropeScalingType: String): this.type = { - set(this.ropeScalingType, ropeScalingType) - } - - /** Set the pooling type for embeddings, use model default if unspecified - * - * - UNSPECIFIED: Don't use any pooling - * - MEAN: Mean Pooling - * - CLS: CLS Pooling - * - * @group setParam - */ - def setPoolingType(poolingType: String): this.type = { set(this.poolingType, poolingType) } - - /** Set the draft model for speculative decoding - * - * @group setParam - */ - def setModelDraft(modelDraft: String): this.type = { set(this.modelDraft, modelDraft) } - - /** Set a model alias - * - * @group setParam - */ - def setLookupCacheStaticFilePath(lookupCacheStaticFilePath: String): this.type = { - set(this.lookupCacheStaticFilePath, lookupCacheStaticFilePath) - } - - /** Set a model alias - * - * @group setParam - */ - def setLookupCacheDynamicFilePath(lookupCacheDynamicFilePath: String): this.type = { - set(this.lookupCacheDynamicFilePath, lookupCacheDynamicFilePath) - } - - /** Sets paths to lora adapters with user defined scale. - * - * @group setParam - */ - def setLoraAdapters(loraAdapters: Map[String, Float]): this.type = { - set(this.loraAdapters, loraAdapters) - } - - /** Sets paths to lora adapters with user defined scale. (PySpark Override) - * - * @group setParam - */ - def setLoraAdapters(loraAdapters: java.util.HashMap[String, java.lang.Double]): this.type = { - val scalaLoraAdapters = loraAdapters.asScala.map { case (k, v) => k -> v.floatValue() } - set(this.loraAdapters, scalaLoraAdapters.toMap) - } - - /** Whether to load model with embedding support - * - * @group setParam - */ - def setEmbedding(embedding: Boolean): this.type = { set(this.embedding, embedding) } - - /** Whether to enable Flash Attention - * - * @group setParam - */ - def setFlashAttention(flashAttention: Boolean): this.type = { - set(this.flashAttention, flashAttention) - } - - /** Whether to add prefix BOS to user inputs, preceding the `--in-prefix` string - * - * @group setParam - */ - def setInputPrefixBos(inputPrefixBos: Boolean): this.type = { - set(this.inputPrefixBos, inputPrefixBos) - } - - /** Whether to use memory-map model (faster load but may increase pageouts if not using mlock) - * - * @group setParam - */ - def setUseMmap(useMmap: Boolean): this.type = { set(this.useMmap, useMmap) } - - /** Whether to force the system to keep model in RAM rather than swapping or compressing - * - * @group setParam - */ - def setUseMlock(useMlock: Boolean): this.type = { set(this.useMlock, useMlock) } - - /** Whether to disable KV offload - * - * @group setParam - */ - def setNoKvOffload(noKvOffload: Boolean): this.type = { set(this.noKvOffload, noKvOffload) } - - /** Set a system prompt to use - * - * @group setParam - */ - def setSystemPrompt(systemPrompt: String): this.type = { set(this.systemPrompt, systemPrompt) } - - /** The chat template to use - * - * @group setParam - */ - def setChatTemplate(chatTemplate: String): this.type = { set(this.chatTemplate, chatTemplate) } - - // ---------------- GETTERS ---------------- - /** @group getParam */ - def getNThreads: Int = $(nThreads) - - /** @group getParam */ - def getNThreadsDraft: Int = $(nThreadsDraft) - - /** @group getParam */ - def getNThreadsBatch: Int = $(nThreadsBatch) - - /** @group getParam */ - def getNThreadsBatchDraft: Int = $(nThreadsBatchDraft) - - /** @group getParam */ - def getNCtx: Int = $(nCtx) 
- - /** @group getParam */ - def getNBatch: Int = $(nBatch) - - /** @group getParam */ - def getNUbatch: Int = $(nUbatch) - - /** @group getParam */ - def getNDraft: Int = $(nDraft) - - /** @group getParam */ - def getNChunks: Int = $(nChunks) - - /** @group getParam */ - def getNSequences: Int = $(nSequences) - - /** @group getParam */ - def getPSplit: Float = $(pSplit) - - /** @group getParam */ - def getNGpuLayers: Int = $(nGpuLayers) - - /** @group getParam */ - def getNGpuLayersDraft: Int = $(nGpuLayersDraft) - - /** @group getParam */ - def getSplitMode: String = $(gpuSplitMode) - - /** @group getParam */ - def getMainGpu: Int = $(mainGpu) - - /** @group getParam */ - def getTensorSplit: Array[Double] = $(tensorSplit) - - def getGrpAttnN: Int = $(grpAttnN) - - /** @group getParam */ - def getGrpAttnW: Int = $(grpAttnW) - - /** @group getParam */ - def getRopeFreqBase: Float = $(ropeFreqBase) - - /** @group getParam */ - def getRopeFreqScale: Float = $(ropeFreqScale) - - /** @group getParam */ - def getYarnExtFactor: Float = $(yarnExtFactor) - - /** @group getParam */ - def getYarnAttnFactor: Float = $(yarnAttnFactor) - - /** @group getParam */ - def getYarnBetaFast: Float = $(yarnBetaFast) - - /** @group getParam */ - def getYarnBetaSlow: Float = $(yarnBetaSlow) - - /** @group getParam */ - def getYarnOrigCtx: Int = $(yarnOrigCtx) - - /** @group getParam */ - def getDefragmentationThreshold: Float = $(defragmentationThreshold) - - /** @group getParam */ - def getNuma: String = $(numaStrategy) - - /** @group getParam */ - def getRopeScalingType: String = $(ropeScalingType) - - /** @group getParam */ - def getPoolingType: String = $(poolingType) - - /** @group getParam */ - def getModelDraft: String = $(modelDraft) - - /** @group getParam */ - def getLookupCacheStaticFilePath: String = $(lookupCacheStaticFilePath) - - /** @group getParam */ - def getLookupCacheDynamicFilePath: String = $(lookupCacheDynamicFilePath) - - /** @group getParam */ - def getLoraAdapters: Map[String, Float] = $$(loraAdapters) - - /** @group getParam */ - def getEmbedding: Boolean = $(embedding) - - /** @group getParam */ - def getFlashAttention: Boolean = $(flashAttention) - - /** @group getParam */ - def getInputPrefixBos: Boolean = $(inputPrefixBos) - - /** @group getParam */ - def getUseMmap: Boolean = $(useMmap) - - /** @group getParam */ - def getUseMlock: Boolean = $(useMlock) - - /** @group getParam */ - def getNoKvOffload: Boolean = $(noKvOffload) - - /** @group getParam */ - def getSystemPrompt: String = $(systemPrompt) - - /** @group getParam */ - def getChatTemplate: String = $(chatTemplate) - - // ---------------- INFERENCE PARAMETERS ---------------- - /** @group param */ - val inputPrefix = - new Param[String](this, "inputPrefix", "Set the prompt to start generation with") - - /** @group param */ - val inputSuffix = - new Param[String](this, "inputSuffix", "Set a suffix for infilling") - - /** @group param */ - val cachePrompt = new BooleanParam( - this, - "cachePrompt", - "Whether to remember the prompt to avoid reprocessing it") - - /** @group param */ - val nPredict = new IntParam(this, "nPredict", "Set the number of tokens to predict") - - /** @group param */ - val topK = new IntParam(this, "topK", "Set top-k sampling") - - /** @group param */ - val topP = new FloatParam(this, "topP", "Set top-p sampling") - - /** @group param */ - val minP = new FloatParam(this, "minP", "Set min-p sampling") - - /** @group param */ - val tfsZ = new FloatParam(this, "tfsZ", "Set tail free sampling, parameter 
z") - - /** @group param */ - val typicalP = new FloatParam(this, "typicalP", "Set locally typical sampling, parameter p") - - /** @group param */ - val temperature = new FloatParam(this, "temperature", "Set the temperature") - - /** @group param */ - val dynamicTemperatureRange = - new FloatParam(this, "dynatempRange", "Set the dynamic temperature range") - - /** @group param */ - val dynamicTemperatureExponent = - new FloatParam(this, "dynatempExponent", "Set the dynamic temperature exponent") - - /** @group param */ - val repeatLastN = - new IntParam(this, "repeatLastN", "Set the last n tokens to consider for penalties") - - /** @group param */ - val repeatPenalty = - new FloatParam(this, "repeatPenalty", "Set the penalty of repeated sequences of tokens") - - /** @group param */ - val frequencyPenalty = - new FloatParam(this, "frequencyPenalty", "Set the repetition alpha frequency penalty") - - /** @group param */ - val presencePenalty = - new FloatParam(this, "presencePenalty", "Set the repetition alpha presence penalty") - - /** @group param */ - val miroStat = new Param[String](this, "miroStat", "Set MiroStat sampling strategies.") - - /** @group param */ - val miroStatTau = - new FloatParam(this, "mirostatTau", "Set the MiroStat target entropy, parameter tau") - - /** @group param */ - val miroStatEta = - new FloatParam(this, "mirostatEta", "Set the MiroStat learning rate, parameter eta") - - /** @group param */ - val penalizeNl = new BooleanParam(this, "penalizeNl", "Whether to penalize newline tokens") - - /** @group param */ - val nKeep = - new IntParam(this, "nKeep", "Set the number of tokens to keep from the initial prompt") - - /** @group param */ - val seed = new IntParam(this, "seed", "Set the RNG seed") - - /** @group param */ - val nProbs = new IntParam( - this, - "nProbs", - "Set the amount top tokens probabilities to output if greater than 0.") - - /** @group param */ - val minKeep = new IntParam( - this, - "minKeep", - "Set the amount of tokens the samplers should return at least (0 = disabled)") - - /** @group param */ - val grammar = - new Param[String](this, "grammar", "Set BNF-like grammar to constrain generations") - - /** @group param */ - val penaltyPrompt = new Param[String]( - this, - "penaltyPrompt", - "Override which part of the prompt is penalized for repetition.") - - /** @group param */ - val ignoreEos = new BooleanParam( - this, - "ignoreEos", - "Set whether to ignore end of stream token and continue generating (implies --logit-bias 2-inf)") - - // Modify the likelihood of tokens appearing in the completion by their id. - val tokenIdBias: StructFeature[Map[Int, Float]] = - new StructFeature[Map[Int, Float]](this, "tokenIdBias") - - // Modify the likelihood of tokens appearing in the completion by their string. 
- /** @group param */ - val tokenBias: StructFeature[Map[String, Float]] = - new StructFeature[Map[String, Float]](this, "tokenBias") - - /** @group param */ - val disableTokenIds = - new IntArrayParam(this, "disableTokenIds", "Set the token ids to disable in the completion") - - /** @group param */ - val stopStrings = new StringArrayParam( - this, - "stopStrings", - "Set strings upon seeing which token generation is stopped") - - /** @group param */ - val samplers = new StringArrayParam( - this, - "samplers", - "Set which samplers to use for token generation in the given order") - - /** @group param */ - val useChatTemplate = new BooleanParam( - this, - "useChatTemplate", - "Set whether or not generate should apply a chat template") - - /** Set the prompt to start generation with - * - * @group setParam - */ - def setInputPrefix(inputPrefix: String): this.type = { set(this.inputPrefix, inputPrefix) } - - /** Set a suffix for infilling - * - * @group setParam - */ - def setInputSuffix(inputSuffix: String): this.type = { set(this.inputSuffix, inputSuffix) } - - /** Whether to remember the prompt to avoid reprocessing it - * - * @group setParam - */ - def setCachePrompt(cachePrompt: Boolean): this.type = { set(this.cachePrompt, cachePrompt) } - - /** Set the number of tokens to predict - * - * @group setParam - */ - def setNPredict(nPredict: Int): this.type = { set(this.nPredict, nPredict) } - - /** Set top-k sampling - * - * @group setParam - */ - def setTopK(topK: Int): this.type = { set(this.topK, topK) } - - /** Set top-p sampling - * - * @group setParam - */ - def setTopP(topP: Float): this.type = { set(this.topP, topP) } - - /** Set min-p sampling - * - * @group setParam - */ - def setMinP(minP: Float): this.type = { set(this.minP, minP) } - - /** Set tail free sampling, parameter z - * @group setParam - */ - def setTfsZ(tfsZ: Float): this.type = { set(this.tfsZ, tfsZ) } - - /** Set locally typical sampling, parameter p - * - * @group setParam - */ - def setTypicalP(typicalP: Float): this.type = { set(this.typicalP, typicalP) } - - /** Set the temperature - * - * @group setParam - */ - def setTemperature(temperature: Float): this.type = { set(this.temperature, temperature) } - - /** Set the dynamic temperature range - * - * @group setParam - */ - def setDynamicTemperatureRange(dynatempRange: Float): this.type = { - set(this.dynamicTemperatureRange, dynatempRange) - } - - /** Set the dynamic temperature exponent - * - * @group setParam - */ - def setDynamicTemperatureExponent(dynatempExponent: Float): this.type = { - set(this.dynamicTemperatureExponent, dynatempExponent) - } - - /** Set the last n tokens to consider for penalties - * - * @group setParam - */ - def setRepeatLastN(repeatLastN: Int): this.type = { set(this.repeatLastN, repeatLastN) } - - /** Set the penalty of repeated sequences of tokens - * - * @group setParam - */ - def setRepeatPenalty(repeatPenalty: Float): this.type = { - set(this.repeatPenalty, repeatPenalty) - } - - /** Set the repetition alpha frequency penalty - * - * @group setParam - */ - def setFrequencyPenalty(frequencyPenalty: Float): this.type = { - set(this.frequencyPenalty, frequencyPenalty) - } - - /** Set the repetition alpha presence penalty - * - * @group setParam - */ - def setPresencePenalty(presencePenalty: Float): this.type = { - set(this.presencePenalty, presencePenalty) - } - - /** Set MiroStat sampling strategies. 
- * - * - DISABLED: No MiroStat - * - V1: MiroStat V1 - * - V2: MiroStat V2 - * - * @group setParam - */ - def setMiroStat(mirostat: String): this.type = set(this.miroStat, mirostat) - - /** Set the MiroStat target entropy, parameter tau - * - * @group setParam - */ - def setMiroStatTau(mirostatTau: Float): this.type = { set(this.miroStatTau, mirostatTau) } - - /** Set the MiroStat learning rate, parameter eta - * - * @group setParam - */ - def setMiroStatEta(mirostatEta: Float): this.type = { set(this.miroStatEta, mirostatEta) } - - /** Set whether to penalize newline tokens - * - * @group setParam - */ - def setPenalizeNl(penalizeNl: Boolean): this.type = { set(this.penalizeNl, penalizeNl) } - - /** Set the number of tokens to keep from the initial prompt - * - * @group setParam - */ - def setNKeep(nKeep: Int): this.type = { set(this.nKeep, nKeep) } - - /** Set the RNG seed - * - * @group setParam - */ - def setSeed(seed: Int): this.type = { set(this.seed, seed) } - - /** Set the amount top tokens probabilities to output if greater than 0. - * - * @group setParam - */ - def setNProbs(nProbs: Int): this.type = { set(this.nProbs, nProbs) } - - /** Set the amount of tokens the samplers should return at least (0 = disabled) - * - * @group setParam - */ - def setMinKeep(minKeep: Int): this.type = { set(this.minKeep, minKeep) } - - /** Set BNF-like grammar to constrain generations - * - * @group setParam - */ - def setGrammar(grammar: String): this.type = { set(this.grammar, grammar) } - - /** Override which part of the prompt is penalized for repetition. - * - * @group setParam - */ - def setPenaltyPrompt(penaltyPrompt: String): this.type = { - set(this.penaltyPrompt, penaltyPrompt) - } - - /** Set whether to ignore end of stream token and continue generating (implies --logit-bias - * 2-inf) - * - * @group setParam - */ - def setIgnoreEos(ignoreEos: Boolean): this.type = { set(this.ignoreEos, ignoreEos) } - - /** Set the tokens to disable during completion. - * - * @group setParam - */ - def setTokenBias(tokenBias: Map[String, Float]): this.type = { - set(this.tokenBias, tokenBias) - } - - /** Set the tokens to disable during completion. (Override for PySpark) - * - * @group setParam - */ - def setTokenBias(tokenBias: java.util.HashMap[String, java.lang.Double]): this.type = { - val scalaTokenBias = tokenBias.asScala.map { case (k, v) => k -> v.floatValue() } - set(this.tokenBias, scalaTokenBias.toMap) - } - - /** Set the token ids to disable in the completion. - * - * @group setParam - */ - def setTokenIdBias(tokenIdBias: Map[Int, Float]): this.type = { - set(this.tokenIdBias, tokenIdBias) - } - - /** Set the token ids to disable in the completion. (Override for PySpark) - * - * @group setParam - */ - def setTokenIdBias(tokenIdBias: java.util.HashMap[Integer, java.lang.Double]): this.type = { - val scalaTokenIdBias = tokenIdBias.asScala.map { case (k, v) => k.toInt -> v.toFloat } - set(this.tokenIdBias, scalaTokenIdBias.toMap) - } - - /** Set the token ids to disable in the completion. This corresponds to `setTokenBias` with a - * value of `Float.NEGATIVE_INFINITY`. - * - * @group setParam - */ - def setDisableTokenIds(disableTokenIds: Array[Int]): this.type = { - set(this.disableTokenIds, disableTokenIds) - } - - /** Set strings upon seeing which token generation is stopped - * - * @group setParam - */ - def setStopStrings(stopStrings: Array[String]): this.type = { - set(this.stopStrings, stopStrings) - } - - /** Set which samplers to use for token generation in the given order . 
- * - * Available Samplers are: - * - * - TOP_K: Top-k sampling - * - TFS_Z: Tail free sampling - * - TYPICAL_P: Locally typical sampling p - * - TOP_P: Top-p sampling - * - MIN_P: Min-p sampling - * - TEMPERATURE: Temperature sampling - * @group setParam - */ - def setSamplers(samplers: Array[String]): this.type = { set(this.samplers, samplers) } - - /** Set whether or not generate should apply a chat template - * - * @group setParam - */ - def setUseChatTemplate(useChatTemplate: Boolean): this.type = { - set(this.useChatTemplate, useChatTemplate) - } - - // ---------------- GETTERS ---------------- - /** @group getParam */ - def getInputPrefix: String = $(inputPrefix) - - /** @group getParam */ - def getInputSuffix: String = $(inputSuffix) - - /** @group getParam */ - def getCachePrompt: Boolean = $(cachePrompt) - - def getNPredict: Int = $(nPredict) - - /** @group getParam */ - def getTopK: Int = $(topK) - - /** @group getParam */ - def getTopP: Float = $(topP) - - /** @group getParam */ - def getMinP: Float = $(minP) - - /** @group getParam */ - def getTfsZ: Float = $(tfsZ) - - /** @group getParam */ - def getTypicalP: Float = $(typicalP) - - /** @group getParam */ - def getTemperature: Float = $(temperature) - - /** @group getParam */ - def getDynamicTemperatureRange: Float = $(dynamicTemperatureRange) - - /** @group getParam */ - def getDynamicTemperatureExponent: Float = $(dynamicTemperatureExponent) - - /** @group getParam */ - def getRepeatLastN: Int = $(repeatLastN) - - /** @group getParam */ - def getRepeatPenalty: Float = $(repeatPenalty) - - /** @group getParam */ - def getFrequencyPenalty: Float = $(frequencyPenalty) - - /** @group getParam */ - def getPresencePenalty: Float = $(presencePenalty) - - /** @group getParam */ - def getMiroStat: String = $(miroStat) - - /** @group getParam */ - def getMiroStatTau: Float = $(miroStatTau) - - /** @group getParam */ - def getMiroStatEta: Float = $(miroStatEta) - - /** @group getParam */ - def getPenalizeNl: Boolean = $(penalizeNl) - - /** @group getParam */ - def getNKeep: Int = $(nKeep) - - /** @group getParam */ - def getSeed: Int = $(seed) - - /** @group getParam */ - def getNProbs: Int = $(nProbs) - - /** @group getParam */ - def getMinKeep: Int = $(minKeep) - - /** @group getParam */ - def getGrammar: String = $(grammar) - - /** @group getParam */ - def getPenaltyPrompt: String = $(penaltyPrompt) - - /** @group getParam */ - def getIgnoreEos: Boolean = $(ignoreEos) - - /** @group getParam */ - def getTokenIdBias: Map[Int, Float] = $$(tokenIdBias) - - /** @group getParam */ - def getTokenBias: Map[String, Float] = $$(tokenBias) - - /** @group getParam */ - def getDisableTokenIds: Array[Int] = $(disableTokenIds) - - /** @group getParam */ - def getStopStrings: Array[String] = $(stopStrings) - - /** @group getParam */ - def getSamplers: Array[String] = $(samplers) - - /** @group getParam */ - def getUseChatTemplate: Boolean = $(useChatTemplate) - - protected def getModelParameters: ModelParameters = { - val modelParameters = new ModelParameters().setContinuousBatching(true) // Always enabled - - if (isDefined(chatTemplate)) modelParameters.setChatTemplate($(chatTemplate)) - if (isDefined(defragmentationThreshold)) - modelParameters.setDefragmentationThreshold($(defragmentationThreshold)) - if (isDefined(embedding)) modelParameters.setEmbedding($(embedding)) - if (isDefined(flashAttention)) modelParameters.setFlashAttention($(flashAttention)) - if (isDefined(gpuSplitMode)) - 
modelParameters.setSplitMode(GpuSplitMode.valueOf($(gpuSplitMode))) - if (isDefined(grpAttnN)) modelParameters.setGrpAttnN($(grpAttnN)) - if (isDefined(grpAttnW)) modelParameters.setGrpAttnN($(grpAttnW)) - if (isDefined(inputPrefixBos)) modelParameters.setInputPrefixBos($(inputPrefixBos)) - if (isDefined(lookupCacheDynamicFilePath)) - modelParameters.setLookupCacheDynamicFilePath($(lookupCacheDynamicFilePath)) - if (isDefined(lookupCacheStaticFilePath)) - modelParameters.setLookupCacheStaticFilePath($(lookupCacheStaticFilePath)) - if (isDefined(mainGpu)) modelParameters.setMainGpu($(mainGpu)) - if (isDefined(modelDraft)) modelParameters.setModelDraft($(modelDraft)) - if (isDefined(nBatch)) modelParameters.setNBatch($(nBatch)) - if (isDefined(nChunks)) modelParameters.setNChunks($(nChunks)) - if (isDefined(nCtx)) modelParameters.setNCtx($(nCtx)) - if (isDefined(nDraft)) modelParameters.setNDraft($(nDraft)) - if (isDefined(nGpuLayers)) modelParameters.setNGpuLayers($(nGpuLayers)) - if (isDefined(nGpuLayersDraft)) modelParameters.setNGpuLayersDraft($(nGpuLayersDraft)) - if (isDefined(nSequences)) modelParameters.setNSequences($(nSequences)) - if (isDefined(nThreads)) modelParameters.setNThreads($(nThreads)) - if (isDefined(nThreadsBatch)) modelParameters.setNThreadsBatch($(nThreadsBatch)) - if (isDefined(nThreadsBatchDraft)) - modelParameters.setNThreadsBatchDraft($(nThreadsBatchDraft)) - if (isDefined(nThreadsDraft)) modelParameters.setNThreadsDraft($(nThreadsDraft)) - if (isDefined(nUbatch)) modelParameters.setNUbatch($(nUbatch)) - if (isDefined(noKvOffload)) modelParameters.setNoKvOffload($(noKvOffload)) - if (isDefined(numaStrategy)) modelParameters.setNuma(NumaStrategy.valueOf($(numaStrategy))) - if (isDefined(pSplit)) modelParameters.setPSplit($(pSplit)) - if (isDefined(poolingType)) - modelParameters.setPoolingType(PoolingType.valueOf($(poolingType))) - if (isDefined(ropeFreqBase)) modelParameters.setRopeFreqBase($(ropeFreqBase)) - if (isDefined(ropeFreqScale)) modelParameters.setRopeFreqScale($(ropeFreqScale)) - if (isDefined(ropeScalingType)) - modelParameters.setRopeScalingType(RopeScalingType.valueOf($(ropeScalingType))) - if (isDefined(systemPrompt)) modelParameters.setSystemPrompt($(systemPrompt)) - if (isDefined(tensorSplit)) modelParameters.setTensorSplit($(tensorSplit).map(_.toFloat)) - if (isDefined(useMlock)) modelParameters.setUseMlock($(useMlock)) - if (isDefined(useMmap)) modelParameters.setUseMmap($(useMmap)) - if (isDefined(yarnAttnFactor)) modelParameters.setYarnAttnFactor($(yarnAttnFactor)) - if (isDefined(yarnBetaFast)) modelParameters.setYarnBetaFast($(yarnBetaFast)) - if (isDefined(yarnBetaSlow)) modelParameters.setYarnBetaSlow($(yarnBetaSlow)) - if (isDefined(yarnExtFactor)) modelParameters.setYarnExtFactor($(yarnExtFactor)) - if (isDefined(yarnOrigCtx)) modelParameters.setYarnOrigCtx($(yarnOrigCtx)) - if (loraAdapters.isSet) { - val loraAdaptersMap: mutable.Map[String, java.lang.Float] = - mutable.Map($$(loraAdapters).map { case (key, value) => - (key, float2Float(value)) - }.toSeq: _*) - modelParameters.setLoraAdapters(loraAdaptersMap.asJava) - } // Need to convert to mutable map first - - modelParameters - } - - protected def getInferenceParameters: InferenceParameters = { - val inferenceParams = new InferenceParameters("") - if (isDefined(cachePrompt)) inferenceParams.setCachePrompt($(cachePrompt)) - if (isDefined(disableTokenIds)) { - val javaCollection: java.util.Collection[Integer] = - $(disableTokenIds).map(int2Integer).toSeq.asJava - 
inferenceParams.disableTokenIds(javaCollection) - } - if (isDefined(dynamicTemperatureExponent)) - inferenceParams.setDynamicTemperatureExponent($(dynamicTemperatureExponent)) - if (isDefined(dynamicTemperatureRange)) - inferenceParams.setDynamicTemperatureRange($(dynamicTemperatureRange)) - if (isDefined(frequencyPenalty)) inferenceParams.setFrequencyPenalty($(frequencyPenalty)) - if (isDefined(grammar)) inferenceParams.setGrammar($(grammar)) - if (isDefined(ignoreEos)) inferenceParams.setIgnoreEos($(ignoreEos)) - if (isDefined(inputPrefix)) inferenceParams.setInputPrefix($(inputPrefix)) - if (isDefined(inputSuffix)) inferenceParams.setInputSuffix($(inputSuffix)) - if (isDefined(minKeep)) inferenceParams.setMinKeep($(minKeep)) - if (isDefined(minP)) inferenceParams.setMinP($(minP)) - if (isDefined(miroStat)) inferenceParams.setMiroStat(MiroStat.valueOf($(miroStat))) - if (isDefined(miroStatEta)) inferenceParams.setMiroStatEta($(miroStatEta)) - if (isDefined(miroStatTau)) inferenceParams.setMiroStatTau($(miroStatTau)) - if (isDefined(nKeep)) inferenceParams.setNKeep($(nKeep)) - if (isDefined(nPredict)) inferenceParams.setNPredict($(nPredict)) - if (isDefined(nProbs)) inferenceParams.setNProbs($(nProbs)) - if (isDefined(penalizeNl)) inferenceParams.setPenalizeNl($(penalizeNl)) - if (isDefined(penaltyPrompt)) inferenceParams.setPenaltyPrompt($(penaltyPrompt)) - if (isDefined(presencePenalty)) inferenceParams.setPresencePenalty($(presencePenalty)) - if (isDefined(repeatLastN)) inferenceParams.setRepeatLastN($(repeatLastN)) - if (isDefined(repeatPenalty)) inferenceParams.setRepeatPenalty($(repeatPenalty)) - if (isDefined(samplers)) inferenceParams.setSamplers($(samplers).map(Sampler.valueOf): _*) - if (isDefined(seed)) inferenceParams.setSeed($(seed)) - if (isDefined(stopStrings)) inferenceParams.setStopStrings($(stopStrings): _*) - if (isDefined(temperature)) inferenceParams.setTemperature($(temperature)) - if (isDefined(tfsZ)) inferenceParams.setTfsZ($(tfsZ)) - if (isDefined(topK)) inferenceParams.setTopK($(topK)) - if (isDefined(topP)) inferenceParams.setTopP($(topP)) - if (isDefined(typicalP)) inferenceParams.setTypicalP($(typicalP)) - if (isDefined(useChatTemplate)) inferenceParams.setUseChatTemplate($(useChatTemplate)) - if (tokenBias.isSet) { - val tokenBiasMap: mutable.Map[String, java.lang.Float] = mutable.Map($$(tokenBias).map { - case (key, value) => (key, float2Float(value)) - }.toSeq: _*) - inferenceParams.setTokenBias(tokenBiasMap.asJava) - } - if (tokenIdBias.isSet) { - val tokenIdBiasMap: mutable.Map[Integer, java.lang.Float] = - mutable.Map($$(tokenIdBias).map { case (key, value) => - (int2Integer(key), float2Float(value)) - }.toSeq: _*) - inferenceParams.setTokenIdBias(tokenIdBiasMap.asJava) - } - - inferenceParams - } - - // ---------------- METADATA ---------------- - val metadata = - new Param[String](this, "metadata", "Set the metadata for the model").setProtected() - - /** Set the metadata for the model - * @group setParam - */ - def setMetadata(metadata: String): this.type = { set(this.metadata, metadata) } - - /** Get the metadata for the model - * @group getParam - */ - def getMetadata: String = $(metadata) -} diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotator.scala b/src/main/scala/com/johnsnowlabs/nlp/annotator.scala index 27daac826bb595..efbd3a288896c1 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotator.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotator.scala @@ -828,4 +828,9 @@ package object annotator { object SnowFlakeEmbeddings extends 
ReadablePretrainedSnowFlakeModel with ReadSnowFlakeDLModel + type AutoGGUFEmbeddings = com.johnsnowlabs.nlp.embeddings.AutoGGUFEmbeddings + object AutoGGUFEmbeddings + extends ReadablePretrainedAutoGGUFEmbeddings + with ReadAutoGGUFEmbeddings + } diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/AutoGGUFModel.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/AutoGGUFModel.scala index 405e48f6d1195b..3caf4bdc0e8be2 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/AutoGGUFModel.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/AutoGGUFModel.scala @@ -23,14 +23,12 @@ import com.johnsnowlabs.nlp.util.io.ResourceHelper import org.apache.spark.broadcast.Broadcast import org.apache.spark.ml.util.Identifiable import org.apache.spark.sql.SparkSession -import org.json4s.DefaultFormats -import org.json4s.jackson.JsonMethods /** Annotator that uses the llama.cpp library to generate text completions with large language * models. * - * For settable parameters, and their explanations, see [[HasLlamaCppProperties]] and refer to - * the llama.cpp documentation of + * For settable parameters, and their explanations, see [[HasLlamaCppInferenceProperties]], + * [[HasLlamaCppModelProperties]] and refer to the llama.cpp documentation of * [[https://github.com/ggerganov/llama.cpp/tree/7d5e8777ae1d21af99d4f95be10db4870720da91/examples/server server.cpp]] * for more information. * @@ -118,7 +116,8 @@ class AutoGGUFModel(override val uid: String) extends AnnotatorModel[AutoGGUFModel] with HasBatchedAnnotate[AutoGGUFModel] with HasEngine - with HasLlamaCppProperties + with HasLlamaCppModelProperties + with HasLlamaCppInferenceProperties with HasProtectedParams { override val outputAnnotatorType: AnnotatorType = AnnotatorType.DOCUMENT @@ -131,10 +130,6 @@ class AutoGGUFModel(override val uid: String) private var _model: Option[Broadcast[GGUFWrapper]] = None - // Values for automatic GPU support - private val defaultGpuLayers = 1000 - private val defaultMainGpu = 0 - /** @group getParam */ def getModelIfNotSet: GGUFWrapper = _model.get.value @@ -145,18 +140,18 @@ class AutoGGUFModel(override val uid: String) } // Entrypoint for models. Automatically set GPU support if detected. - val usingGPUJar: Boolean = spark.sparkContext.listJars.exists(_.contains("spark-nlp-gpu")) - if (usingGPUJar) { - logger.info("Using GPU jar. 
Offloading all layers to GPU.") - setMainGpu(defaultMainGpu) - setNGpuLayers(defaultGpuLayers) - } - this + setGpuSupportIfAvailable(spark) } private[johnsnowlabs] def setEngine(engineName: String): this.type = set(engine, engineName) - setDefault(engine -> LlamaCPP.name) + setDefault( + engine -> LlamaCPP.name, + useChatTemplate -> true, + nCtx -> 4096, + nBatch -> 512, + embedding -> false, + nPredict -> 100) override def onWrite(path: String, spark: SparkSession): Unit = { super.onWrite(path, spark) @@ -173,6 +168,7 @@ class AutoGGUFModel(override val uid: String) override def batchAnnotate(batchedAnnotations: Seq[Array[Annotation]]): Seq[Seq[Annotation]] = { val annotations: Seq[Annotation] = batchedAnnotations.flatten if (annotations.nonEmpty) { + val annotationsText = annotations.map(_.result) val modelParams = getModelParameters.setNParallel(getBatchSize) // set parallel decoding to batch size @@ -180,18 +176,36 @@ class AutoGGUFModel(override val uid: String) val model: LlamaModel = getModelIfNotSet.getSession(modelParams) - val annotationsText = annotations.map(_.result) - - val (completedTexts: Array[String], metadata: Map[String, String]) = - try { - (model.requestBatchCompletion(annotationsText.toArray, inferenceParams), Map.empty) - } catch { - case e: Exception => - logger.error("Error in llama.cpp batch completion", e) - (Array[String](), Map("exception" -> e.getMessage)) + if (getEmbedding) { + // Return embeddings in annotation + val (embeddings: Array[Array[Float]], metadata: Map[String, String]) = + try { + (model.requestBatchEmbeddings(annotationsText.toArray), Map.empty) + } catch { + case e: Exception => + logger.error("Error in llama.cpp embeddings", e) + (Array.empty[Array[Float]], Map("llamacpp_exception" -> e.getMessage)) + } + // Choose empty text for result annotations + annotations.zip(embeddings).map { case (annotation, embedding) => + Seq( + new Annotation( + annotatorType = annotation.annotatorType, + begin = annotation.begin, + end = annotation.end, + result = annotation.result, + metadata = annotation.metadata ++ metadata, + embeddings = embedding)) } - - val result: Seq[Seq[Annotation]] = + } else { + val (completedTexts: Array[String], metadata: Map[String, String]) = + try { + (model.requestBatchCompletion(annotationsText.toArray, inferenceParams), Map.empty) + } catch { + case e: Exception => + logger.error("Error in llama.cpp batch completion", e) + (Array[String](), Map("llamacpp_exception" -> e.getMessage)) + } annotations.zip(completedTexts).map { case (annotation, text) => Seq( new Annotation( @@ -201,18 +215,9 @@ class AutoGGUFModel(override val uid: String) text, annotation.metadata ++ metadata)) } - result + } } else Seq(Seq.empty[Annotation]) } - - def getMetadataMap: Map[String, String] = { - val metadataJsonString = getMetadata - if (metadataJsonString.isEmpty) Map.empty - else { - implicit val formats: DefaultFormats.type = DefaultFormats - JsonMethods.parse(metadataJsonString).extract[Map[String, String]] - } - } } trait ReadablePretrainedAutoGGUFModel diff --git a/src/main/scala/com/johnsnowlabs/nlp/embeddings/AutoGGUFEmbeddings.scala b/src/main/scala/com/johnsnowlabs/nlp/embeddings/AutoGGUFEmbeddings.scala new file mode 100644 index 00000000000000..98aa10eb8b31ac --- /dev/null +++ b/src/main/scala/com/johnsnowlabs/nlp/embeddings/AutoGGUFEmbeddings.scala @@ -0,0 +1,241 @@ +/* + * Copyright 2017-2024 John Snow Labs + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance 
with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.johnsnowlabs.nlp.embeddings + +import com.johnsnowlabs.ml.gguf.GGUFWrapper +import com.johnsnowlabs.ml.util.LlamaCPP +import com.johnsnowlabs.nlp._ +import com.johnsnowlabs.nlp.llama.LlamaModel +import com.johnsnowlabs.nlp.util.io.ResourceHelper +import org.apache.spark.broadcast.Broadcast +import org.apache.spark.ml.util.Identifiable +import org.apache.spark.sql.SparkSession + +/** Annotator that uses the llama.cpp library to generate text embeddings with large language + * models. + * + * The type of embedding pooling can be set with the `setPoolingType` method. The default is + * `"MEAN"`. The available options are `"NONE"`, `"MEAN"`, `"CLS"`, and `"LAST"`. + * + * For all settable parameters, and their explanations, see [[HasLlamaCppModelProperties]]. + * + * Pretrained models can be loaded with `pretrained` of the companion object: + * {{{ + * val autoGGUFModel = AutoGGUFEmbeddings.pretrained() + * .setInputCols("document") + * .setOutputCol("embeddings") + * }}} + * The default model is `"nomic-embed-text-v1.5.Q8_0.gguf"`, if no name is provided. + * + * For available pretrained models please see the [[https://sparknlp.org/models Models Hub]]. + * + * For extended examples of usage, see the + * [[https://github.com/JohnSnowLabs/spark-nlp/tree/master/src/test/scala/com/johnsnowlabs/nlp/embeddings/AutoGGUFEmbeddingsTest.scala AutoGGUFEmbeddingsTest]] + * and the + * [[https://github.com/JohnSnowLabs/spark-nlp/tree/master/examples/python/llama.cpp/llama.cpp_in_Spark_NLP_AutoGGUFEmbeddings.ipynb example notebook]]. + * + * ==Note== + * To use GPU inference with this annotator, make sure to use the Spark NLP GPU package and set + * the number of GPU layers with the `setNGpuLayers` method. + * + * When using larger models, we recommend adjusting GPU usage with `setNCtx` and `setNGpuLayers` + * according to your hardware to avoid out-of-memory errors. 
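+ *
+ * For example, a hypothetical configuration (the exact values depend on the model size and
+ * the available hardware, so treat them only as a starting point) could look like this:
+ * {{{
+ * val embeddings = AutoGGUFEmbeddings
+ *   .pretrained()
+ *   .setInputCols("document")
+ *   .setOutputCol("embeddings")
+ *   .setPoolingType("MEAN") // one of "NONE", "MEAN", "CLS", "LAST"
+ *   .setNCtx(4096)          // context size used by the model
+ *   .setNGpuLayers(99)      // offload all layers when using the GPU package
+ * }}}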
+ *
+ * ==Example==
+ *
+ * {{{
+ * import com.johnsnowlabs.nlp.base._
+ * import com.johnsnowlabs.nlp.annotator._
+ * import org.apache.spark.ml.Pipeline
+ * import spark.implicits._
+ *
+ * val document = new DocumentAssembler().setInputCol("text").setOutputCol("document")
+ *
+ * val autoGGUFEmbeddings = AutoGGUFEmbeddings
+ *   .pretrained()
+ *   .setInputCols("document")
+ *   .setOutputCol("embeddings")
+ *   .setBatchSize(4)
+ *   .setPoolingType("MEAN")
+ *
+ * val pipeline = new Pipeline().setStages(Array(document, autoGGUFEmbeddings))
+ *
+ * val data = Seq(
+ *   "The moons of Jupiter are 77 in total, with 79 confirmed natural satellites and 2 man-made ones.")
+ *   .toDF("text")
+ * val result = pipeline.fit(data).transform(data)
+ * result.select("embeddings.embeddings").show(truncate = false)
+ * +--------------------------------------------------------------------------------+
+ * | embeddings|
+ * +--------------------------------------------------------------------------------+
+ * |[[-0.034486726, 0.07770534, -0.15982522, -0.017873349, 0.013914132, 0.0365736...|
+ * +--------------------------------------------------------------------------------+
+ * }}}
+ *
+ * @param uid
+ *   required uid for storing annotator to disk
+ * @groupname anno Annotator types
+ * @groupdesc anno
+ *   Required input and expected output annotator types
+ * @groupname Ungrouped Members
+ * @groupname param Parameters
+ * @groupname setParam Parameter setters
+ * @groupname getParam Parameter getters
+ * @groupprio param 1
+ * @groupprio anno 2
+ * @groupprio Ungrouped 3
+ * @groupprio setParam 4
+ * @groupprio getParam 5
+ * @groupdesc param
+ *   A list of (hyper-)parameter keys this annotator can take. Users can set and get the
+ *   parameter values through setters and getters, respectively.
+ */
+class AutoGGUFEmbeddings(override val uid: String)
+    extends AnnotatorModel[AutoGGUFEmbeddings]
+    with HasBatchedAnnotate[AutoGGUFEmbeddings]
+    with HasEngine
+    with HasLlamaCppModelProperties
+    with HasProtectedParams {
+
+  override val inputAnnotatorTypes: Array[AnnotatorType] = Array(AnnotatorType.DOCUMENT)
+  override val outputAnnotatorType: AnnotatorType = AnnotatorType.SENTENCE_EMBEDDINGS
+
+  /** Annotator reference id. Used to identify elements in metadata or to refer to this annotator
+    * type
+    */
+  def this() = this(Identifiable.randomUID("AutoGGUFEmbeddings"))
+
+  private var _model: Option[Broadcast[GGUFWrapper]] = None
+
+  /** @group getParam */
+  def getModelIfNotSet: GGUFWrapper = _model.get.value
+
+  /** @group setParam */
+  def setModelIfNotSet(spark: SparkSession, wrapper: GGUFWrapper): this.type = {
+    if (_model.isEmpty) {
+      _model = Some(spark.sparkContext.broadcast(wrapper))
+    }
+
+    setGpuSupportIfAvailable(spark)
+  }
+
+  private[johnsnowlabs] def setEngine(engineName: String): this.type = set(engine, engineName)
+
+  setDefault(
+    engine -> LlamaCPP.name,
+    embedding -> true,
+    poolingType -> "MEAN",
+    nCtx -> 4096,
+    nBatch -> 512)
+
+  override def onWrite(path: String, spark: SparkSession): Unit = {
+    super.onWrite(path, spark)
+    getModelIfNotSet.saveToFile(path)
+  }
+
+  /** Computes sentence embeddings for the batch of annotations.
+    *
+    * @param batchedAnnotations
+    *   Annotations (single element arrays) in batches
+    * @return
+    *   Sentence embeddings for each input annotation
+    */
+  override def batchAnnotate(batchedAnnotations: Seq[Array[Annotation]]): Seq[Seq[Annotation]] = {
+    require(
+      getEmbedding,
+      "Embeddings have been manually disabled.
Please enable them with setEmbedding(true).") + val annotations: Seq[Annotation] = batchedAnnotations.flatten + if (annotations.nonEmpty) { + + val modelParams = + getModelParameters.setNParallel(getBatchSize) // set parallel decoding to batch size + + val model: LlamaModel = getModelIfNotSet.getSession(modelParams) + + val annotationsText = annotations.map(_.result) + + // Return embeddings in annotation + val (embeddings: Array[Array[Float]], metadata: Map[String, String]) = + try { + (model.requestBatchEmbeddings(annotationsText.toArray), Map.empty) + } catch { + case e: Exception => + logger.error("Error in llama.cpp embeddings", e) + (Array.empty[Array[Float]], Map("llamacpp_exception" -> e.getMessage)) + } + + // Choose empty text for result annotations + annotations.zip(embeddings).map { case (annotation, embedding) => + Seq( + new Annotation( + annotatorType = annotation.annotatorType, + begin = annotation.begin, + end = annotation.end, + result = annotation.result, + metadata = annotation.metadata ++ metadata, + embeddings = embedding)) + } + } else Seq(Seq.empty[Annotation]) + } +} + +trait ReadablePretrainedAutoGGUFEmbeddings + extends ParamsAndFeaturesReadable[AutoGGUFEmbeddings] + with HasPretrained[AutoGGUFEmbeddings] { + override val defaultModelName: Some[String] = Some("nomic-embed-text-v1.5.Q8_0.gguf") + override val defaultLang: String = "en" + + /** Java compliant-overrides */ + override def pretrained(): AutoGGUFEmbeddings = super.pretrained() + + override def pretrained(name: String): AutoGGUFEmbeddings = super.pretrained(name) + + override def pretrained(name: String, lang: String): AutoGGUFEmbeddings = + super.pretrained(name, lang) + + override def pretrained(name: String, lang: String, remoteLoc: String): AutoGGUFEmbeddings = + super.pretrained(name, lang, remoteLoc) +} + +trait ReadAutoGGUFEmbeddings { + this: ParamsAndFeaturesReadable[AutoGGUFEmbeddings] => + + def readModel(instance: AutoGGUFEmbeddings, path: String, spark: SparkSession): Unit = { + val model: GGUFWrapper = GGUFWrapper.readModel(path, spark) + instance.setModelIfNotSet(spark, model) + } + + addReader(readModel) + + def loadSavedModel(modelPath: String, spark: SparkSession): AutoGGUFEmbeddings = { + // TODO potentially enable download from HF-URLS + val localPath: String = ResourceHelper.copyToLocal(modelPath) + val annotatorModel = new AutoGGUFEmbeddings() + annotatorModel + .setModelIfNotSet(spark, GGUFWrapper.read(spark, localPath)) + .setEngine(LlamaCPP.name) + + val metadata = LlamaModel.getMetadataFromFile(localPath) + if (metadata.nonEmpty) annotatorModel.setMetadata(metadata) + annotatorModel + } +} + +/** This is the companion object of [[AutoGGUFEmbeddings]]. Please refer to that class for the + * documentation. 
+ */ +object AutoGGUFEmbeddings extends ReadablePretrainedAutoGGUFEmbeddings with ReadAutoGGUFEmbeddings diff --git a/src/main/scala/com/johnsnowlabs/nlp/pretrained/ResourceDownloader.scala b/src/main/scala/com/johnsnowlabs/nlp/pretrained/ResourceDownloader.scala index 145bcc67f26b35..b359523c202f37 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/pretrained/ResourceDownloader.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/pretrained/ResourceDownloader.scala @@ -690,7 +690,8 @@ object PythonResourceDownloader { "SnowFlakeEmbeddings" -> SnowFlakeEmbeddings, "CamemBertForZeroShotClassification" -> CamemBertForZeroShotClassification, "BertForMultipleChoice" -> BertForMultipleChoice, - "PromptAssembler" -> PromptAssembler) + "PromptAssembler" -> PromptAssembler, + "AutoGGUFEmbeddings" -> AutoGGUFEmbeddings) // List pairs of types such as the one with key type can load a pretrained model from the value type val typeMapper: Map[String, String] = Map("ZeroShotNerModel" -> "RoBertaForQuestionAnswering") diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/seq2seq/AutoGGUFModelTest.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/seq2seq/AutoGGUFModelTest.scala index b4234f24197b7c..f755b76dfa2e72 100644 --- a/src/test/scala/com/johnsnowlabs/nlp/annotators/seq2seq/AutoGGUFModelTest.scala +++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/seq2seq/AutoGGUFModelTest.scala @@ -14,9 +14,6 @@ class AutoGGUFModelTest extends AnyFlatSpec { behavior of "AutoGGUFModelTest" - // Set Spark Debug level - ResourceHelper.spark.sparkContext.setLogLevel("INFO") - lazy val documentAssembler = new DocumentAssembler() .setInputCol("text") .setOutputCol("document") diff --git a/src/test/scala/com/johnsnowlabs/nlp/embeddings/AutoGGUFEmbeddingsTestSpec.scala b/src/test/scala/com/johnsnowlabs/nlp/embeddings/AutoGGUFEmbeddingsTestSpec.scala new file mode 100644 index 00000000000000..b7c4544bdbd87f --- /dev/null +++ b/src/test/scala/com/johnsnowlabs/nlp/embeddings/AutoGGUFEmbeddingsTestSpec.scala @@ -0,0 +1,86 @@ +package com.johnsnowlabs.nlp.embeddings + +import com.johnsnowlabs.nlp.Annotation +import com.johnsnowlabs.nlp.base.DocumentAssembler +import com.johnsnowlabs.nlp.util.io.ResourceHelper +import com.johnsnowlabs.tags.SlowTest +import org.apache.spark.ml.Pipeline +import org.scalatest.flatspec.AnyFlatSpec + +class AutoGGUFEmbeddingsTestSpec extends AnyFlatSpec { + import ResourceHelper.spark.implicits._ + + behavior of "AutoGGUFEmbeddings" + + lazy val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + + lazy val data = Seq( + "The moons of Jupiter are ", // "The moons of Jupiter are 77 in total, with 79 confirmed natural satellites and 2 man-made ones. The four" + "Earth is ", // "Earth is 4.5 billion years old. It has been home to countless species, some of which have gone extinct, while others have evolved into" + "The moon is ", // "The moon is 1/400th the size of the sun. 
The sun is 1.39 million kilometers in diameter, while" + "The sun is " // + ).toDF("text").repartition(1) + + // nomic-embed-text-v1.5.Q8_0.gguf + def model(poolingType: String): AutoGGUFEmbeddings = AutoGGUFEmbeddings + .pretrained() + .setInputCols("document") + .setOutputCol("embeddings") + .setBatchSize(4) + .setPoolingType(poolingType) + + def pipeline(embedModel: AutoGGUFEmbeddings = model("MEAN")) = + new Pipeline().setStages(Array(documentAssembler, embedModel)) + + it should "produce embeddings" taggedAs SlowTest in { + val result = pipeline().fit(data).transform(data) + val collected = Annotation.collect(result, "embeddings") + + collected.foreach { annotations => + val embeddings = annotations.head.embeddings + assert(embeddings != null, "embeddings should not be null") + assert( + embeddings.sum > 0.0, + "embeddings should not be zero. Was there an error on llama.cpp side?") + } + } + + it should "produce embeddings of different pooling types" taggedAs SlowTest in { + def testPoolingType(poolingType: String): Unit = { + val result = pipeline(model(poolingType)).fit(data).transform(data) + val embeddings: Array[Float] = Annotation.collect(result, "embeddings").head.head.embeddings + + assert(embeddings != null, "embeddings should not be null") + assert( + embeddings.sum > 0.0, + "embeddings should not be zero. Was there an error on llama.cpp side?") + } + + Seq("NONE", "MEAN", "CLS", "LAST").foreach(testPoolingType) + } + + it should "be serializable" taggedAs SlowTest in { + + val data = Seq("Hello, I am a").toDF("text") + lazy val pipeline = new Pipeline().setStages(Array(documentAssembler, model("MEAN"))) + + val pipelineModel = pipeline.fit(data) + val savePath = "./tmp_autogguf_model" + pipelineModel.stages.last + .asInstanceOf[AutoGGUFEmbeddings] + .write + .overwrite() + .save(savePath) + + val loadedModel = AutoGGUFEmbeddings.load(savePath) + val newPipeline: Pipeline = new Pipeline().setStages(Array(documentAssembler, loadedModel)) + + newPipeline + .fit(data) + .transform(data) + .select("embeddings.embeddings") + .show(truncate = false) + } +}
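
Besides `pretrained()`, the companion object's `loadSavedModel` can load a local GGUF file directly (see `ReadAutoGGUFEmbeddings` above). The following is a minimal sketch, assuming an existing `spark` session and a hypothetical local model path; any embedding model in GGUF format should work:

```scala
import com.johnsnowlabs.nlp.base.DocumentAssembler
import com.johnsnowlabs.nlp.embeddings.AutoGGUFEmbeddings
import org.apache.spark.ml.Pipeline
import spark.implicits._

val documentAssembler = new DocumentAssembler()
  .setInputCol("text")
  .setOutputCol("document")

// Hypothetical local path to a GGUF embedding model
val embeddings = AutoGGUFEmbeddings
  .loadSavedModel("/tmp/nomic-embed-text-v1.5.Q8_0.gguf", spark)
  .setInputCols("document")
  .setOutputCol("embeddings")
  .setPoolingType("MEAN")

val pipeline = new Pipeline().setStages(Array(documentAssembler, embeddings))

val data = Seq("Spark NLP is an open-source text processing library.").toDF("text")
pipeline.fit(data).transform(data).select("embeddings.embeddings").show()
```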