diff --git a/docs/en/annotator_entries/AutoGGUFEmbeddings.md b/docs/en/annotator_entries/AutoGGUFEmbeddings.md new file mode 100644 index 00000000000000..9c872393a515dc --- /dev/null +++ b/docs/en/annotator_entries/AutoGGUFEmbeddings.md @@ -0,0 +1,123 @@ +{%- capture title -%} +AutoGGUFEmbeddings +{%- endcapture -%} + +{%- capture description -%} +Annotator that uses the llama.cpp library to generate text embeddings with large language +models. + +The type of embedding pooling can be set with the `setPoolingType` method. The default is +`"MEAN"`. The available options are `"NONE"`, `"MEAN"`, `"CLS"`, and `"LAST"`. + +If the parameters are not set, the annotator will default to using the parameters provided by +the model. + +Pretrained models can be loaded with `pretrained` of the companion object: + +```scala +val autoGGUFEmbeddings = AutoGGUFEmbeddings.pretrained() + .setInputCols("document") + .setOutputCol("embeddings") +``` + +The default model is `"nomic-embed-text-v1.5.Q8_0.gguf"`, if no name is provided. + +For available pretrained models please see the [Models Hub](https://sparknlp.org/models). + +For extended examples of usage, see the +[AutoGGUFEmbeddingsTest](https://github.com/JohnSnowLabs/spark-nlp/tree/master/src/test/scala/com/johnsnowlabs/nlp/annotators/seq2seq/AutoGGUFEmbeddingsTest.scala) +and the +[example notebook](https://github.com/JohnSnowLabs/spark-nlp/tree/master/examples/python/llama.cpp/llama.cpp_in_Spark_NLP_AutoGGUFEmbeddings.ipynb). + +**Note**: To use GPU inference with this annotator, make sure to use the Spark NLP GPU package and set +the number of GPU layers with the `setNGpuLayers` method. + +When using larger models, we recommend adjusting GPU usage with `setNCtx` and `setNGpuLayers` +according to your hardware to avoid out-of-memory errors. +{%- endcapture -%} + +{%- capture input_anno -%} +DOCUMENT +{%- endcapture -%} + +{%- capture output_anno -%} +SENTENCE_EMBEDDINGS +{%- endcapture -%} + +{%- capture python_example -%} +>>> import sparknlp +>>> from sparknlp.base import * +>>> from sparknlp.annotator import * +>>> from pyspark.ml import Pipeline +>>> document = DocumentAssembler() \ +... .setInputCol("text") \ +... .setOutputCol("document") +>>> autoGGUFEmbeddings = AutoGGUFEmbeddings.pretrained() \ +... .setInputCols(["document"]) \ +... .setOutputCol("embeddings") \ +... .setBatchSize(4) \ +... .setNGpuLayers(99) \ +... 
.setPoolingType("MEAN") +>>> pipeline = Pipeline().setStages([document, autoGGUFEmbeddings]) +>>> data = spark.createDataFrame([["The moons of Jupiter are 77 in total, with 79 confirmed natural satellites and 2 man-made ones."]]).toDF("text") +>>> result = pipeline.fit(data).transform(data) +>>> result.select("completions").show() ++--------------------------------------------------------------------------------+ +| embeddings| ++--------------------------------------------------------------------------------+ +|[[-0.034486726, 0.07770534, -0.15982522, -0.017873349, 0.013914132, 0.0365736...| ++--------------------------------------------------------------------------------+ +{%- endcapture -%} + +{%- capture scala_example -%} +import com.johnsnowlabs.nlp.base._ +import com.johnsnowlabs.nlp.annotator._ +import org.apache.spark.ml.Pipeline +import spark.implicits._ + +val document = new DocumentAssembler().setInputCol("text").setOutputCol("document") + +val autoGGUFEmbeddings = AutoGGUFEmbeddings + .pretrained() + .setInputCols("document") + .setOutputCol("embeddings") + .setBatchSize(4) + .setPoolingType("MEAN") + +val pipeline = new Pipeline().setStages(Array(document, autoGGUFEmbeddings)) + +val data = Seq( + "The moons of Jupiter are 77 in total, with 79 confirmed natural satellites and 2 man-made ones.") + .toDF("text") +val result = pipeline.fit(data).transform(data) +result.select("embeddings.embeddings").show(1, truncate=80) ++--------------------------------------------------------------------------------+ +| embeddings| ++--------------------------------------------------------------------------------+ +|[[-0.034486726, 0.07770534, -0.15982522, -0.017873349, 0.013914132, 0.0365736...| ++--------------------------------------------------------------------------------+ +{%- endcapture -%} + +{%- capture api_link -%} +[AutoGGUFEmbeddings](/api/com/johnsnowlabs/nlp/embeddings/AutoGGUFEmbeddings) +{%- endcapture -%} + +{%- capture python_api_link -%} +[AutoGGUFEmbeddings](/api/python/reference/autosummary/sparknlp/annotator/embeddings/auto_gguf_embeddings/index.html) +{%- endcapture -%} + +{%- capture source_link -%} +[AutoGGUFEmbeddings](https://github.com/JohnSnowLabs/spark-nlp/tree/master/src/main/scala/com/johnsnowlabs/nlp/embeddings/AutoGGUFEmbeddings.scala) +{%- endcapture -%} + +{% include templates/anno_template.md +title=title +description=description +input_anno=input_anno +output_anno=output_anno +python_example=python_example +scala_example=scala_example +api_link=api_link +python_api_link=python_api_link +source_link=source_link +%} \ No newline at end of file diff --git a/docs/en/annotator_entries/AutoGGUF.md b/docs/en/annotator_entries/AutoGGUFModel.md similarity index 100% rename from docs/en/annotator_entries/AutoGGUF.md rename to docs/en/annotator_entries/AutoGGUFModel.md diff --git a/docs/en/annotators.md b/docs/en/annotators.md index 4526453a7ebc94..c5c21707b80f8e 100644 --- a/docs/en/annotators.md +++ b/docs/en/annotators.md @@ -45,6 +45,7 @@ There are two types of Annotators: {:.table-model-big} |Annotator|Description|Version | |---|---|---| +{% include templates/anno_table_entry.md path="" name="AutoGGUFEmbeddings" summary="Annotator that uses the llama.cpp library to generate text embeddings with large language models."%} {% include templates/anno_table_entry.md path="" name="AutoGGUFModel" summary="Annotator that uses the llama.cpp library to generate text completions with large language models."%} {% include templates/anno_table_entry.md path="" 
name="BGEEmbeddings" summary="Sentence embeddings using BGE."%} {% include templates/anno_table_entry.md path="" name="BigTextMatcher" summary="Annotator to match exact phrases (by token) provided in a file against a Document."%} diff --git a/examples/python/llama.cpp/PromptAssember_with_AutoGGUFModel.ipynb b/examples/python/llama.cpp/PromptAssember_with_AutoGGUFModel.ipynb index 9eb0f1884e8bb7..d4152e51194c25 100644 --- a/examples/python/llama.cpp/PromptAssember_with_AutoGGUFModel.ipynb +++ b/examples/python/llama.cpp/PromptAssember_with_AutoGGUFModel.ipynb @@ -251,7 +251,7 @@ "provenance": [] }, "kernelspec": { - "display_name": "Python 3", + "display_name": "sparknlp_dev", "language": "python", "name": "python3" }, @@ -264,7 +264,8 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3" + "pygments_lexer": "ipython3", + "version": "3.10.12" } }, "nbformat": 4, diff --git a/examples/python/llama.cpp/llama.cpp_in_Spark_NLP_AutoGGUFEmbeddings.ipynb b/examples/python/llama.cpp/llama.cpp_in_Spark_NLP_AutoGGUFEmbeddings.ipynb new file mode 100644 index 00000000000000..2adfdad89625ec --- /dev/null +++ b/examples/python/llama.cpp/llama.cpp_in_Spark_NLP_AutoGGUFEmbeddings.ipynb @@ -0,0 +1,429 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![JohnSnowLabs](https://sparknlp.org/assets/images/logo.png)\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/llama.cpp/llama.cpp_in_Spark_NLP_AutoGGUFEmbeddings.ipynb)\n", + "\n", + "# llama.cpp ๐Ÿฆ™ embedding models in Spark NLP ๐Ÿš€\n", + "\n", + "Let's keep in mind a few things before we start ๐Ÿ˜Š\n", + "\n", + "- Support for llama.cpp embeddings was introduced in `Spark NLP 5.5.1`, enabling quantized LLM inference on a wide range of devices. Please make sure you have upgraded to the latest Spark NLP release.\n", + "- You need to use your own `.gguf` model files, which also include the models from the [Hugging Face Models](https://huggingface.co/models?library=gguf)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Download a GGUF Model\n", + "\n", + "Lets download a GGUF model to test it out. For this, we will use [nomic-ai/nomic-embed-text-v1.5-GGUF](https://huggingface.co/nomic-ai/nomic-embed-text-v1.5-GGUF). We can download the model by selecting the Q8_0 GGUF file from the \"Files and versions\" tab.\n", + "\n", + "Once downloaded, we can directly import this model into Spark NLP!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--2024-11-02 13:42:45-- https://huggingface.co/nomic-ai/nomic-embed-text-v1.5-GGUF/resolve/main/nomic-embed-text-v1.5.Q8_0.gguf?download=true\n", + "Resolving huggingface.co (huggingface.co)... 3.160.39.87, 3.160.39.100, 3.160.39.99, ...\n", + "Connecting to huggingface.co (huggingface.co)|3.160.39.87|:443... connected.\n", + "HTTP request sent, awaiting response... 
302 Found\n", + "Location: https://cdn-lfs-us-1.hf.co/repos/19/39/19396cd98fe8b02e39b1be815db29f6b251fee34fc5d6550db0b478083fdda2f/f7af6f66802f4df86eda10fe9bbcfc75c39562bed48ef6ace719a251cf1c2fdb?response-content-disposition=attachment%3B+filename*%3DUTF-8%27%27nomic-embed-text-v1.5.Q8_0.gguf%3B+filename%3D%22nomic-embed-text-v1.5.Q8_0.gguf%22%3B&Expires=1730810566&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTczMDgxMDU2Nn19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy11cy0xLmhmLmNvL3JlcG9zLzE5LzM5LzE5Mzk2Y2Q5OGZlOGIwMmUzOWIxYmU4MTVkYjI5ZjZiMjUxZmVlMzRmYzVkNjU1MGRiMGI0NzgwODNmZGRhMmYvZjdhZjZmNjY4MDJmNGRmODZlZGExMGZlOWJiY2ZjNzVjMzk1NjJiZWQ0OGVmNmFjZTcxOWEyNTFjZjFjMmZkYj9yZXNwb25zZS1jb250ZW50LWRpc3Bvc2l0aW9uPSoifV19&Signature=R7WUe1icdziIE4kS%7EHMcsyiySLOHJpkJ9lM2nC6EtIwPj6V12RRpjpDIyyv0%7EY9h32v2HDomyyNO6-Ry5UeIm3UjyESR9In3kmTKAqZX2zzlslTQMXDQghmLpIEQGgmh5-5RHrFgYPNxKmVICyQL1Vz9IgFQRMfdug6RBTGgmmXfLgksa9IU7TdvZcqvOb68HCdmv1hEt2U5vH4A9MF81ohMBqrvTb9389jzrlP1tZtNFb5wjNdZDmr57XIsvQRZB0ZDUIsMT1nc5QehNpWpX4jMLBSnkj1-oL9XN7%7EhAXDbB1mTH9kbrD3UUNKRm4%7ER-gVhegqsfirdSFi66sP3bg__&Key-Pair-Id=K24J24Z295AEI9 [following]\n", + "--2024-11-02 13:42:46-- https://cdn-lfs-us-1.hf.co/repos/19/39/19396cd98fe8b02e39b1be815db29f6b251fee34fc5d6550db0b478083fdda2f/f7af6f66802f4df86eda10fe9bbcfc75c39562bed48ef6ace719a251cf1c2fdb?response-content-disposition=attachment%3B+filename*%3DUTF-8%27%27nomic-embed-text-v1.5.Q8_0.gguf%3B+filename%3D%22nomic-embed-text-v1.5.Q8_0.gguf%22%3B&Expires=1730810566&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTczMDgxMDU2Nn19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy11cy0xLmhmLmNvL3JlcG9zLzE5LzM5LzE5Mzk2Y2Q5OGZlOGIwMmUzOWIxYmU4MTVkYjI5ZjZiMjUxZmVlMzRmYzVkNjU1MGRiMGI0NzgwODNmZGRhMmYvZjdhZjZmNjY4MDJmNGRmODZlZGExMGZlOWJiY2ZjNzVjMzk1NjJiZWQ0OGVmNmFjZTcxOWEyNTFjZjFjMmZkYj9yZXNwb25zZS1jb250ZW50LWRpc3Bvc2l0aW9uPSoifV19&Signature=R7WUe1icdziIE4kS%7EHMcsyiySLOHJpkJ9lM2nC6EtIwPj6V12RRpjpDIyyv0%7EY9h32v2HDomyyNO6-Ry5UeIm3UjyESR9In3kmTKAqZX2zzlslTQMXDQghmLpIEQGgmh5-5RHrFgYPNxKmVICyQL1Vz9IgFQRMfdug6RBTGgmmXfLgksa9IU7TdvZcqvOb68HCdmv1hEt2U5vH4A9MF81ohMBqrvTb9389jzrlP1tZtNFb5wjNdZDmr57XIsvQRZB0ZDUIsMT1nc5QehNpWpX4jMLBSnkj1-oL9XN7%7EhAXDbB1mTH9kbrD3UUNKRm4%7ER-gVhegqsfirdSFi66sP3bg__&Key-Pair-Id=K24J24Z295AEI9\n", + "Resolving cdn-lfs-us-1.hf.co (cdn-lfs-us-1.hf.co)... 18.66.2.2, 18.66.2.116, 18.66.2.98, ...\n", + "Connecting to cdn-lfs-us-1.hf.co (cdn-lfs-us-1.hf.co)|18.66.2.2|:443... connected.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 274290560 (262M) [application/octet-stream]\n", + "Saving to: โ€˜nomic-embed-text-v1.5.Q8_0.ggufโ€™\n", + "\n", + "nomic-embed-text-v1 100%[===================>] 261.58M 23.8MB/s in 10s \n", + "\n", + "2024-11-02 13:42:56 (24.9 MB/s) - โ€˜nomic-embed-text-v1.5.Q8_0.ggufโ€™ saved [274290560/274290560]\n", + "\n" + ] + } + ], + "source": [ + "EXPORT_PATH = \"nomic-embed-text-v1.5.Q8_0.gguf\"\n", + "! wget \"https://huggingface.co/nomic-ai/nomic-embed-text-v1.5-GGUF/resolve/main/{EXPORT_PATH}?download=true\" -O {EXPORT_PATH}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Import and Save AutGGUF models in Spark NLP\n", + "\n", + "- Let's install and setup Spark NLP (if running it Google Colab)\n", + "- This part is pretty easy via our simple script" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Only execute this if you are on Google Colab\n", + "! 
wget -q http://setup.johnsnowlabs.com/colab.sh -O - | bash" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's start Spark with Spark NLP included via our simple `start()` function" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import sparknlp\n", + "\n", + "# let's start Spark with Spark NLP with GPU enabled. If you don't have GPUs available remove this parameter.\n", + "spark = sparknlp.start(gpu=True)\n", + "print(sparknlp.version())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- Let's use the `loadSavedModel` function in `AutoGGUFEmbeddings`\n", + "- Most params will be set automatically. They can also be set later at runtime after loading the model into `AutoGGUFEmbeddings`, so don't worry about setting them now.\n", + "- `loadSavedModel` accepts two params: the first is the path to the exported model, and the second is the SparkSession, i.e. the `spark` variable we previously started via `sparknlp.start()`\n", + "- The model is loaded in embedding mode by default, so it will return the embeddings in the resulting annotations.\n", + "- NOTE: `loadSavedModel` accepts local paths in addition to distributed file systems such as `HDFS`, `S3`, `DBFS`, etc. This feature was introduced in the Spark NLP 4.2.2 release. Keep in mind the best and recommended way to move/share/reuse Spark NLP models is to use `write.save` so you can use `.load()` from any file systems natively." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "jsl-llama: Extracted 'libjllama.so' to '/tmp/libjllama.so'\n" + ] + } + ], + "source": [ + "from sparknlp.annotator import *\n", + "\n", + "# All these params should be identical to the original GGUF model\n", + "autoGGUFEmbeddings = (\n", + " AutoGGUFEmbeddings.loadSavedModel(EXPORT_PATH, spark)\n", + " .setInputCols(\"document\")\n", + " .setOutputCol(\"embeddings\")\n", + " .setBatchSize(4)\n", + " .setNGpuLayers(99)\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- Let's save it on disk so it is easier to move around and can also be used later via the `.load` function" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "24/11/02 13:48:29 WARN TaskSetManager: Stage 0 contains a task of very large size (1073 KiB). The maximum recommended task size is 1000 KiB.\n" + ] + } + ], + "source": [ + "autoGGUFEmbeddings.write().overwrite().save(f\"nomic-embed-text-v1.5.Q8_0.gguf_spark_nlp\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's clean up stuff we don't need anymore" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!rm -rf {EXPORT_PATH}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Awesome 😎!\n", + "\n", + "This is your GGUF model, loaded and saved by Spark NLP 🚀" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "total 267872\n", + "drwxr-xr-x 2 root root 4096 Nov 2 13:48 metadata\n", + "-rwxrwxr-x 1 root root 274290560 Nov 2 13:48 nomic-embed-text-v1.5.Q8_0.gguf\n" + ] + } + ], + "source": [ + "! 
ls -l nomic-embed-text-v1.5.Q8_0.gguf_spark_nlp/" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now let's see how we can use it on other machines, clusters, or any place you wish to use your new and shiny GGUF model ๐Ÿ˜Š" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "24/11/02 13:48:57 WARN SparkContext: The path /home/root/Workspace/scala/spark-nlp/examples/python/llama.cpp/nomic-embed-text-v1.5.Q8_0.gguf_spark_nlp/nomic-embed-text-v1.5.Q8_0.gguf has been added already. Overwriting of added paths is not supported in the current version.\n", + "24/11/02 13:48:57 WARN DAGScheduler: Broadcasting large task binary with size 1028.0 KiB\n", + "24/11/02 13:48:57 WARN DAGScheduler: Broadcasting large task binary with size 1028.0 KiB\n", + "24/11/02 13:48:57 WARN DAGScheduler: Broadcasting large task binary with size 1028.0 KiB\n", + "llama_model_loader: loaded meta data with 22 key-value pairs and 112 tensors from /tmp/spark-6de50aee-1059-4698-98e2-db9d68663467/userFiles-932de0e7-9a8f-41f5-9aaf-94bb7406df74/nomic-embed-text-v1.5.Q8_0.gguf (version GGUF V3 (latest))\n", + "llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", + "llama_model_loader: - kv 0: general.architecture str = nomic-bert\n", + "llama_model_loader: - kv 1: general.name str = nomic-embed-text-v1.5\n", + "llama_model_loader: - kv 2: nomic-bert.block_count u32 = 12\n", + "llama_model_loader: - kv 3: nomic-bert.context_length u32 = 2048\n", + "llama_model_loader: - kv 4: nomic-bert.embedding_length u32 = 768\n", + "llama_model_loader: - kv 5: nomic-bert.feed_forward_length u32 = 3072\n", + "llama_model_loader: - kv 6: nomic-bert.attention.head_count u32 = 12\n", + "llama_model_loader: - kv 7: nomic-bert.attention.layer_norm_epsilon f32 = 0.000000\n", + "llama_model_loader: - kv 8: general.file_type u32 = 1\n", + "llama_model_loader: - kv 9: nomic-bert.attention.causal bool = false\n", + "llama_model_loader: - kv 10: nomic-bert.pooling_type u32 = 1\n", + "llama_model_loader: - kv 11: nomic-bert.rope.freq_base f32 = 1000.000000\n", + "llama_model_loader: - kv 12: tokenizer.ggml.token_type_count u32 = 2\n", + "llama_model_loader: - kv 13: tokenizer.ggml.bos_token_id u32 = 101\n", + "llama_model_loader: - kv 14: tokenizer.ggml.eos_token_id u32 = 102\n", + "llama_model_loader: - kv 15: tokenizer.ggml.model str = bert\n", + "llama_model_loader: - kv 16: tokenizer.ggml.tokens arr[str,30522] = [\"[PAD]\", \"[unused0]\", \"[unused1]\", \"...\n", + "llama_model_loader: - kv 17: tokenizer.ggml.scores arr[f32,30522] = [-1000.000000, -1000.000000, -1000.00...\n", + "llama_model_loader: - kv 18: tokenizer.ggml.token_type arr[i32,30522] = [3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...\n", + "llama_model_loader: - kv 19: tokenizer.ggml.unknown_token_id u32 = 100\n", + "llama_model_loader: - kv 20: tokenizer.ggml.seperator_token_id u32 = 102\n", + "llama_model_loader: - kv 21: tokenizer.ggml.padding_token_id u32 = 0\n", + "llama_model_loader: - type f32: 51 tensors\n", + "llama_model_loader: - type f16: 61 tensors\n", + "llm_load_vocab: special tokens cache size = 5\n", + "llm_load_vocab: token to piece cache size = 0.2032 MB\n", + "llm_load_print_meta: format = GGUF V3 (latest)\n", + "llm_load_print_meta: arch = nomic-bert\n", + "llm_load_print_meta: vocab type = WPM\n", + "llm_load_print_meta: n_vocab = 30522\n", + "llm_load_print_meta: n_merges = 0\n", + 
"llm_load_print_meta: vocab_only = 0\n", + "llm_load_print_meta: n_ctx_train = 2048\n", + "llm_load_print_meta: n_embd = 768\n", + "llm_load_print_meta: n_layer = 12\n", + "llm_load_print_meta: n_head = 12\n", + "llm_load_print_meta: n_head_kv = 12\n", + "llm_load_print_meta: n_rot = 64\n", + "llm_load_print_meta: n_swa = 0\n", + "llm_load_print_meta: n_embd_head_k = 64\n", + "llm_load_print_meta: n_embd_head_v = 64\n", + "llm_load_print_meta: n_gqa = 1\n", + "llm_load_print_meta: n_embd_k_gqa = 768\n", + "llm_load_print_meta: n_embd_v_gqa = 768\n", + "llm_load_print_meta: f_norm_eps = 1.0e-12\n", + "llm_load_print_meta: f_norm_rms_eps = 0.0e+00\n", + "llm_load_print_meta: f_clamp_kqv = 0.0e+00\n", + "llm_load_print_meta: f_max_alibi_bias = 0.0e+00\n", + "llm_load_print_meta: f_logit_scale = 0.0e+00\n", + "llm_load_print_meta: n_ff = 3072\n", + "llm_load_print_meta: n_expert = 0\n", + "llm_load_print_meta: n_expert_used = 0\n", + "llm_load_print_meta: causal attn = 0\n", + "llm_load_print_meta: pooling type = 1\n", + "llm_load_print_meta: rope type = 2\n", + "llm_load_print_meta: rope scaling = linear\n", + "llm_load_print_meta: freq_base_train = 1000.0\n", + "llm_load_print_meta: freq_scale_train = 1\n", + "llm_load_print_meta: n_ctx_orig_yarn = 2048\n", + "llm_load_print_meta: rope_finetuned = unknown\n", + "llm_load_print_meta: ssm_d_conv = 0\n", + "llm_load_print_meta: ssm_d_inner = 0\n", + "llm_load_print_meta: ssm_d_state = 0\n", + "llm_load_print_meta: ssm_dt_rank = 0\n", + "llm_load_print_meta: model type = 137M\n", + "llm_load_print_meta: model ftype = F16\n", + "llm_load_print_meta: model params = 136.73 M\n", + "llm_load_print_meta: model size = 260.86 MiB (16.00 BPW) \n", + "llm_load_print_meta: general.name = nomic-embed-text-v1.5\n", + "llm_load_print_meta: BOS token = 101 '[CLS]'\n", + "llm_load_print_meta: EOS token = 102 '[SEP]'\n", + "llm_load_print_meta: UNK token = 100 '[UNK]'\n", + "llm_load_print_meta: SEP token = 102 '[SEP]'\n", + "llm_load_print_meta: PAD token = 0 '[PAD]'\n", + "llm_load_print_meta: CLS token = 101 '[CLS]'\n", + "llm_load_print_meta: MASK token = 103 '[MASK]'\n", + "llm_load_print_meta: LF token = 0 '[PAD]'\n", + "llm_load_print_meta: max token length = 21\n", + "llm_load_tensors: ggml ctx size = 0.05 MiB\n", + "llm_load_tensors: CPU buffer size = 260.86 MiB\n", + ".......................................................\n", + "llama_new_context_with_model: n_ctx = 4096\n", + "llama_new_context_with_model: n_batch = 512\n", + "llama_new_context_with_model: n_ubatch = 512\n", + "llama_new_context_with_model: flash_attn = 0\n", + "llama_new_context_with_model: freq_base = 1000.0\n", + "llama_new_context_with_model: freq_scale = 1\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[WARN] Not compiled with GPU offload support, --n-gpu-layers option will be ignored. 
See main README.md for information on enabling GPU BLAS support n_gpu_layers=-1\n", + "[INFO] build info build=3534 commit=\"641f5dd2\"\n", + "[INFO] system info n_threads=6 n_threads_batch=-1 total_threads=6 system_info=\"AVX = 1 | AVX_VNNI = 0 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 1 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | \"\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama_kv_cache_init: CPU KV buffer size = 144.00 MiB\n", + "llama_new_context_with_model: KV self size = 144.00 MiB, K (f16): 72.00 MiB, V (f16): 72.00 MiB\n", + "llama_new_context_with_model: CPU output buffer size = 0.00 MiB\n", + "ggml_gallocr_reserve_n: reallocating CPU buffer from size 0.00 MiB to 23.00 MiB\n", + "llama_new_context_with_model: CPU compute buffer size = 23.00 MiB\n", + "llama_new_context_with_model: graph nodes = 453\n", + "llama_new_context_with_model: graph splits = 1\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[INFO] initializing slots n_slots=4\n", + "[INFO] new slot id_slot=0 n_ctx_slot=1024\n", + "[INFO] new slot id_slot=1 n_ctx_slot=1024\n", + "[INFO] new slot id_slot=2 n_ctx_slot=1024\n", + "[INFO] new slot id_slot=3 n_ctx_slot=1024\n", + "[INFO] model loaded\n", + "[INFO] chat template chat_example=\"<|im_start|>system\\nYou are a helpful assistant<|im_end|>\\n<|im_start|>user\\nHello<|im_end|>\\n<|im_start|>assistant\\nHi there<|im_end|>\\n<|im_start|>user\\nHow are you?<|im_end|>\\n<|im_start|>assistant\\n\" built_in=true\n", + "[INFO] slot is processing task id_slot=0 id_task=0\n", + "[INFO] kv cache rm [p0, end) id_slot=0 id_task=0 p0=0\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[Stage 12:> (0 + 1) / 1]\r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[INFO] slot released id_slot=0 id_task=0 n_ctx=4096 n_past=7 n_system_tokens=0 n_cache_tokens=0 truncated=false\n", + "[INFO] all slots are idle\n", + "+--------------------------------------------------------------------------------+\n", + "| embeddings|\n", + "+--------------------------------------------------------------------------------+\n", + "|[[0.046383496, 0.02353651, -0.12484242, -0.009759982, 0.05522549, -0.01701891...|\n", + "+--------------------------------------------------------------------------------+\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + } + ], + "source": [ + "import sparknlp\n", + "from sparknlp.base import *\n", + "from sparknlp.annotator import *\n", + "from pyspark.ml import Pipeline\n", + "\n", + "document_assembler = DocumentAssembler().setInputCol(\"text\").setOutputCol(\"document\")\n", + "\n", + "autoGGUFEmbeddings = AutoGGUFEmbeddings.load(\"nomic-embed-text-v1.5.Q8_0.gguf_spark_nlp\")\n", + "\n", + "pipeline = Pipeline().setStages([document_assembler, autoGGUFEmbeddings])\n", + "\n", + "data = spark.createDataFrame([[\"This is a sentence.\"]]).toDF(\"text\")\n", + "\n", + "result = pipeline.fit(data).transform(data)\n", + "result.select(\"embeddings.embeddings\").show(1, 80)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "That's it! 
You can now go wild and use hundreds of GGUF models from HuggingFace ๐Ÿค— in Spark NLP ๐Ÿš€\n" + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/python/sparknlp/annotator/embeddings/__init__.py b/python/sparknlp/annotator/embeddings/__init__.py index be622971684f8b..da453d2c555037 100644 --- a/python/sparknlp/annotator/embeddings/__init__.py +++ b/python/sparknlp/annotator/embeddings/__init__.py @@ -40,3 +40,4 @@ from sparknlp.annotator.embeddings.mxbai_embeddings import * from sparknlp.annotator.embeddings.snowflake_embeddings import * from sparknlp.annotator.embeddings.nomic_embeddings import * +from sparknlp.annotator.embeddings.auto_gguf_embeddings import * diff --git a/python/sparknlp/annotator/embeddings/auto_gguf_embeddings.py b/python/sparknlp/annotator/embeddings/auto_gguf_embeddings.py new file mode 100755 index 00000000000000..30cee663c16129 --- /dev/null +++ b/python/sparknlp/annotator/embeddings/auto_gguf_embeddings.py @@ -0,0 +1,538 @@ +# Copyright 2017-2023 John Snow Labs +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Contains classes for the AutoGGUFEmbeddings.""" +from typing import List + +from sparknlp.common import * + + +class AutoGGUFEmbeddings(AnnotatorModel, HasBatchedAnnotate): + """ + Annotator that uses the llama.cpp library to generate text embeddings with large language + models + + The type of embedding pooling can be set with the `setPoolingType` method. The default is + `"MEAN"`. The available options are `"NONE"`, `"MEAN"`, `"CLS"`, and `"LAST"`. + + Pretrained models can be loaded with :meth:`.pretrained` of the companion + object: + + >>> auto_gguf_model = AutoGGUFEmbeddings.pretrained() \\ + ... .setInputCols(["document"]) \\ + ... .setOutputCol("embeddings") + + The default model is ``"nomic-embed-text-v1.5.Q8_0.gguf"``, if no name is provided. + + For extended examples of usage, see the + `AutoGGUFEmbeddingsTest `__ + and the + `example notebook `__. + + For available pretrained models please see the `Models Hub `__. 
+ + ====================== ====================== + Input Annotation types Output Annotation type + ====================== ====================== + ``DOCUMENT`` ``SENTENCE_EMBEDDINGS`` + ====================== ====================== + + Parameters + ---------- + nThreads + Set the number of threads to use during generation + nThreadsBatch + Set the number of threads to use during batch and prompt processing + nCtx + Set the size of the prompt context + nBatch + Set the logical batch size for prompt processing (must be >=32 to use BLAS) + nUbatch + Set the physical batch size for prompt processing (must be >=32 to use BLAS) + nChunks + Set the maximal number of chunks to process + nSequences + Set the number of sequences to decode + nGpuLayers + Set the number of layers to store in VRAM (-1 - use default) + gpuSplitMode + Set how to split the model across GPUs + mainGpu + Set the main GPU that is used for scratch and small tensors. + tensorSplit + Set how split tensors should be distributed across GPUs + grpAttnN + Set the group-attention factor + grpAttnW + Set the group-attention width + ropeFreqBase + Set the RoPE base frequency, used by NTK-aware scaling + ropeFreqScale + Set the RoPE frequency scaling factor, expands context by a factor of 1/N + yarnExtFactor + Set the YaRN extrapolation mix factor + yarnAttnFactor + Set the YaRN scale sqrt(t) or attention magnitude + yarnBetaFast + Set the YaRN low correction dim or beta + yarnBetaSlow + Set the YaRN high correction dim or alpha + yarnOrigCtx + Set the YaRN original context size of model + defragmentationThreshold + Set the KV cache defragmentation threshold + numaStrategy + Set optimization strategies that help on some NUMA systems (if available) + ropeScalingType + Set the RoPE frequency scaling method, defaults to linear unless specified by the model + poolingType + Set the pooling type for embeddings, use model default if unspecified + flashAttention + Whether to enable Flash Attention + useMmap + Whether to use memory-map model (faster load but may increase pageouts if not using mlock) + useMlock + Whether to force the system to keep model in RAM rather than swapping or compressing + noKvOffload + Whether to disable KV offload + + Notes + ----- + To use GPU inference with this annotator, make sure to use the Spark NLP GPU package and set + the number of GPU layers with the `setNGpuLayers` method. + + When using larger models, we recommend adjusting GPU usage with `setNCtx` and `setNGpuLayers` + according to your hardware to avoid out-of-memory errors. + + Examples + -------- + >>> import sparknlp + >>> from sparknlp.base import * + >>> from sparknlp.annotator import * + >>> from pyspark.ml import Pipeline + >>> document = DocumentAssembler() \\ + ... .setInputCol("text") \\ + ... .setOutputCol("document") + >>> autoGGUFEmbeddings = AutoGGUFEmbeddings.pretrained() \\ + ... .setInputCols(["document"]) \\ + ... .setOutputCol("embeddings") \\ + ... .setBatchSize(4) \\ + ... .setNGpuLayers(99) \\ + ... 
.setPoolingType("MEAN") + >>> pipeline = Pipeline().setStages([document, autoGGUFEmbeddings]) + >>> data = spark.createDataFrame([["The moons of Jupiter are 77 in total, with 79 confirmed natural satellites and 2 man-made ones."]]).toDF("text") + >>> result = pipeline.fit(data).transform(data) + >>> result.select("embeddings.embeddings").show(truncate = False) + +--------------------------------------------------------------------------------+ + | embeddings| + +--------------------------------------------------------------------------------+ + |[[-0.034486726, 0.07770534, -0.15982522, -0.017873349, 0.013914132, 0.0365736...| + +--------------------------------------------------------------------------------+ + """ + + name = "AutoGGUFEmbeddings" + inputAnnotatorTypes = [AnnotatorType.DOCUMENT] + outputAnnotatorType = AnnotatorType.DOCUMENT + + # -------- MODEl PARAMETERS -------- + nThreads = Param( + Params._dummy(), + "nThreads", + "Set the number of threads to use during generation", + typeConverter=TypeConverters.toInt, + ) + nThreadsBatch = Param( + Params._dummy(), + "nThreadsBatch", + "Set the number of threads to use during batch and prompt processing", + typeConverter=TypeConverters.toInt, + ) + nCtx = Param( + Params._dummy(), + "nCtx", + "Set the size of the prompt context", + typeConverter=TypeConverters.toInt, + ) + nBatch = Param( + Params._dummy(), + "nBatch", + "Set the logical batch size for prompt processing (must be >=32 to use BLAS)", + typeConverter=TypeConverters.toInt, + ) + nUbatch = Param( + Params._dummy(), + "nUbatch", + "Set the physical batch size for prompt processing (must be >=32 to use BLAS)", + typeConverter=TypeConverters.toInt, + ) + nChunks = Param( + Params._dummy(), + "nChunks", + "Set the maximal number of chunks to process", + typeConverter=TypeConverters.toInt, + ) + nSequences = Param( + Params._dummy(), + "nSequences", + "Set the number of sequences to decode", + typeConverter=TypeConverters.toInt, + ) + nGpuLayers = Param( + Params._dummy(), + "nGpuLayers", + "Set the number of layers to store in VRAM (-1 - use default)", + typeConverter=TypeConverters.toInt, + ) + # Set how to split the model across GPUs + # + # - NONE: No GPU split + # - LAYER: Split the model across GPUs by layer + # - ROW: Split the model across GPUs by rows + gpuSplitMode = Param( + Params._dummy(), + "gpuSplitMode", + "Set how to split the model across GPUs", + typeConverter=TypeConverters.toString, + ) + mainGpu = Param( + Params._dummy(), + "mainGpu", + "Set the main GPU that is used for scratch and small tensors.", + typeConverter=TypeConverters.toInt, + ) + tensorSplit = Param( + Params._dummy(), + "tensorSplit", + "Set how split tensors should be distributed across GPUs", + typeConverter=TypeConverters.toListFloat, + ) + grpAttnN = Param( + Params._dummy(), + "grpAttnN", + "Set the group-attention factor", + typeConverter=TypeConverters.toInt, + ) + grpAttnW = Param( + Params._dummy(), + "grpAttnW", + "Set the group-attention width", + typeConverter=TypeConverters.toInt, + ) + ropeFreqBase = Param( + Params._dummy(), + "ropeFreqBase", + "Set the RoPE base frequency, used by NTK-aware scaling", + typeConverter=TypeConverters.toFloat, + ) + ropeFreqScale = Param( + Params._dummy(), + "ropeFreqScale", + "Set the RoPE frequency scaling factor, expands context by a factor of 1/N", + typeConverter=TypeConverters.toFloat, + ) + yarnExtFactor = Param( + Params._dummy(), + "yarnExtFactor", + "Set the YaRN extrapolation mix factor", + typeConverter=TypeConverters.toFloat, + ) 
+ yarnAttnFactor = Param( + Params._dummy(), + "yarnAttnFactor", + "Set the YaRN scale sqrt(t) or attention magnitude", + typeConverter=TypeConverters.toFloat, + ) + yarnBetaFast = Param( + Params._dummy(), + "yarnBetaFast", + "Set the YaRN low correction dim or beta", + typeConverter=TypeConverters.toFloat, + ) + yarnBetaSlow = Param( + Params._dummy(), + "yarnBetaSlow", + "Set the YaRN high correction dim or alpha", + typeConverter=TypeConverters.toFloat, + ) + yarnOrigCtx = Param( + Params._dummy(), + "yarnOrigCtx", + "Set the YaRN original context size of model", + typeConverter=TypeConverters.toInt, + ) + defragmentationThreshold = Param( + Params._dummy(), + "defragmentationThreshold", + "Set the KV cache defragmentation threshold", + typeConverter=TypeConverters.toFloat, + ) + # Set optimization strategies that help on some NUMA systems (if available) + # + # Available Strategies: + # + # - DISABLED: No NUMA optimizations + # - DISTRIBUTE: Spread execution evenly over all + # - ISOLATE: Only spawn threads on CPUs on the node that execution started on + # - NUMA_CTL: Use the CPU map provided by numactl + # - MIRROR: Mirrors the model across NUMA nodes + numaStrategy = Param( + Params._dummy(), + "numaStrategy", + "Set optimization strategies that help on some NUMA systems (if available)", + typeConverter=TypeConverters.toString, + ) + # Set the RoPE frequency scaling method, defaults to linear unless specified by the model. + # + # - UNSPECIFIED: Don't use any scaling + # - LINEAR: Linear scaling + # - YARN: YaRN RoPE scaling + ropeScalingType = Param( + Params._dummy(), + "ropeScalingType", + "Set the RoPE frequency scaling method, defaults to linear unless specified by the model", + typeConverter=TypeConverters.toString, + ) + # Set the pooling type for embeddings, use model default if unspecified + # + # - 0 UNSPECIFIED: Don't use any pooling + # - 1 MEAN: Mean Pooling + # - 2 CLS: CLS Pooling + poolingType = Param( + Params._dummy(), + "poolingType", + "Set the pooling type for embeddings, use model default if unspecified", + typeConverter=TypeConverters.toString, + ) + embedding = Param( + Params._dummy(), + "embedding", + "Whether to load model with embedding support", + typeConverter=TypeConverters.toBoolean, + ) + flashAttention = Param( + Params._dummy(), + "flashAttention", + "Whether to enable Flash Attention", + typeConverter=TypeConverters.toBoolean, + ) + useMmap = Param( + Params._dummy(), + "useMmap", + "Whether to use memory-map model (faster load but may increase pageouts if not using mlock)", + typeConverter=TypeConverters.toBoolean, + ) + useMlock = Param( + Params._dummy(), + "useMlock", + "Whether to force the system to keep model in RAM rather than swapping or compressing", + typeConverter=TypeConverters.toBoolean, + ) + noKvOffload = Param( + Params._dummy(), + "noKvOffload", + "Whether to disable KV offload", + typeConverter=TypeConverters.toBoolean, + ) + + # -------- MODEL SETTERS -------- + def setNThreads(self, nThreads: int): + """Set the number of threads to use during generation""" + return self._set(nThreads=nThreads) + + def setNThreadsBatch(self, nThreadsBatch: int): + """Set the number of threads to use during batch and prompt processing""" + return self._set(nThreadsBatch=nThreadsBatch) + + def setNCtx(self, nCtx: int): + """Set the size of the prompt context""" + return self._set(nCtx=nCtx) + + def setNBatch(self, nBatch: int): + """Set the logical batch size for prompt processing (must be >=32 to use BLAS)""" + return self._set(nBatch=nBatch) + 
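(Editor's note: the context and batching setters above map directly onto llama.cpp loader options. Below is a minimal usage sketch, not part of this file, that combines a few of them on the pretrained embeddings annotator; the values simply echo the defaults and examples documented in this class, and an active Spark NLP session is assumed.)

```python
from sparknlp.annotator import AutoGGUFEmbeddings

# Minimal sketch: tuning context size and batching on the default pretrained model.
embeddings = (
    AutoGGUFEmbeddings.pretrained()   # defaults to nomic-embed-text-v1.5.Q8_0.gguf
    .setInputCols("document")
    .setOutputCol("embeddings")
    .setBatchSize(4)                  # rows per annotation batch (HasBatchedAnnotate)
    .setNCtx(4096)                    # size of the prompt context
    .setNBatch(512)                   # logical batch size (must be >= 32 to use BLAS)
    .setNUbatch(512)                  # physical batch size (must be >= 32 to use BLAS)
    .setNGpuLayers(99)                # layers kept in VRAM; needs the Spark NLP GPU package
)
```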
+ def setNUbatch(self, nUbatch: int): + """Set the physical batch size for prompt processing (must be >=32 to use BLAS)""" + return self._set(nUbatch=nUbatch) + + def setNChunks(self, nChunks: int): + """Set the maximal number of chunks to process""" + return self._set(nChunks=nChunks) + + def setNSequences(self, nSequences: int): + """Set the number of sequences to decode""" + return self._set(nSequences=nSequences) + + def setNGpuLayers(self, nGpuLayers: int): + """Set the number of layers to store in VRAM (-1 - use default)""" + return self._set(nGpuLayers=nGpuLayers) + + def setGpuSplitMode(self, gpuSplitMode: str): + """Set how to split the model across GPUs""" + return self._set(gpuSplitMode=gpuSplitMode) + + def setMainGpu(self, mainGpu: int): + """Set the main GPU that is used for scratch and small tensors.""" + return self._set(mainGpu=mainGpu) + + def setTensorSplit(self, tensorSplit: List[float]): + """Set how split tensors should be distributed across GPUs""" + return self._set(tensorSplit=tensorSplit) + + def setGrpAttnN(self, grpAttnN: int): + """Set the group-attention factor""" + return self._set(grpAttnN=grpAttnN) + + def setGrpAttnW(self, grpAttnW: int): + """Set the group-attention width""" + return self._set(grpAttnW=grpAttnW) + + def setRopeFreqBase(self, ropeFreqBase: float): + """Set the RoPE base frequency, used by NTK-aware scaling""" + return self._set(ropeFreqBase=ropeFreqBase) + + def setRopeFreqScale(self, ropeFreqScale: float): + """Set the RoPE frequency scaling factor, expands context by a factor of 1/N""" + return self._set(ropeFreqScale=ropeFreqScale) + + def setYarnExtFactor(self, yarnExtFactor: float): + """Set the YaRN extrapolation mix factor""" + return self._set(yarnExtFactor=yarnExtFactor) + + def setYarnAttnFactor(self, yarnAttnFactor: float): + """Set the YaRN scale sqrt(t) or attention magnitude""" + return self._set(yarnAttnFactor=yarnAttnFactor) + + def setYarnBetaFast(self, yarnBetaFast: float): + """Set the YaRN low correction dim or beta""" + return self._set(yarnBetaFast=yarnBetaFast) + + def setYarnBetaSlow(self, yarnBetaSlow: float): + """Set the YaRN high correction dim or alpha""" + return self._set(yarnBetaSlow=yarnBetaSlow) + + def setYarnOrigCtx(self, yarnOrigCtx: int): + """Set the YaRN original context size of model""" + return self._set(yarnOrigCtx=yarnOrigCtx) + + def setDefragmentationThreshold(self, defragmentationThreshold: float): + """Set the KV cache defragmentation threshold""" + return self._set(defragmentationThreshold=defragmentationThreshold) + + def setNumaStrategy(self, numaStrategy: str): + """Set optimization strategies that help on some NUMA systems (if available)""" + numaUpper = numaStrategy.upper() + numaStrategies = ["DISABLED", "DISTRIBUTE", "ISOLATE", "NUMA_CTL", "MIRROR"] + if numaUpper not in numaStrategies: + raise ValueError( + f"Invalid NUMA strategy: {numaUpper}. " + + f"Valid values are: {numaStrategies}" + ) + return self._set(numaStrategy=numaStrategy) + + def setRopeScalingType(self, ropeScalingType: str): + """Set the RoPE frequency scaling method, defaults to linear unless specified by the model""" + return self._set(ropeScalingType=ropeScalingType) + + def setPoolingType(self, poolingType: str): + """Set the pooling type for embeddings, use model default if unspecified""" + poolingTypeUpper = poolingType.upper() + poolingTypes = ["NONE", "MEAN", "CLS", "LAST"] + if poolingTypeUpper not in poolingTypes: + raise ValueError( + f"Invalid pooling type: {poolingType}. 
" + + f"Valid values are: {poolingTypes}" + ) + return self._set(poolingType=poolingType) + + def setFlashAttention(self, flashAttention: bool): + """Whether to enable Flash Attention""" + return self._set(flashAttention=flashAttention) + + def setUseMmap(self, useMmap: bool): + """Whether to use memory-map model (faster load but may increase pageouts if not using mlock)""" + return self._set(useMmap=useMmap) + + def setUseMlock(self, useMlock: bool): + """Whether to force the system to keep model in RAM rather than swapping or compressing""" + return self._set(useMlock=useMlock) + + def setNoKvOffload(self, noKvOffload: bool): + """Whether to disable KV offload""" + return self._set(noKvOffload=noKvOffload) + + def getMetadata(self): + """Gets the metadata of the model""" + return self._call_java("getMetadata") + + @keyword_only + def __init__( + self, + classname="com.johnsnowlabs.nlp.embeddings.AutoGGUFEmbeddings", + java_model=None, + ): + super(AutoGGUFEmbeddings, self).__init__( + classname=classname, java_model=java_model + ) + self._setDefault( + embedding=True, + nCtx=4096, + nBatch=512, + poolingType="MEAN", + ) + + @staticmethod + def loadSavedModel(folder, spark_session): + """Loads a locally saved model. + + Parameters + ---------- + folder : str + Folder of the saved model + spark_session : pyspark.sql.SparkSession + The current SparkSession + + Returns + ------- + AutoGGUFEmbeddings + The restored model + """ + from sparknlp.internal import _AutoGGUFEmbeddingsLoader + + jModel = _AutoGGUFEmbeddingsLoader(folder, spark_session._jsparkSession)._java_obj + return AutoGGUFEmbeddings(java_model=jModel) + + @staticmethod + def pretrained(name="nomic-embed-text-v1.5.Q8_0.gguf", lang="en", remote_loc=None): + """Downloads and loads a pretrained model. + + Parameters + ---------- + name : str, optional + Name of the pretrained model, by default "nomic-embed-text-v1.5.Q8_0.gguf" + lang : str, optional + Language of the pretrained model, by default "en" + remote_loc : str, optional + Optional remote address of the resource, by default None. Will use + Spark NLPs repositories otherwise. + + Returns + ------- + AutoGGUFEmbeddings + The restored model + """ + from sparknlp.pretrained import ResourceDownloader + + return ResourceDownloader.downloadModel( + AutoGGUFEmbeddings, name, lang, remote_loc + ) diff --git a/python/sparknlp/annotator/seq2seq/auto_gguf_model.py b/python/sparknlp/annotator/seq2seq/auto_gguf_model.py index 37af88d7dbbe15..d28ac006c9da22 100755 --- a/python/sparknlp/annotator/seq2seq/auto_gguf_model.py +++ b/python/sparknlp/annotator/seq2seq/auto_gguf_model.py @@ -199,7 +199,6 @@ class AutoGGUFModel(AnnotatorModel, HasBatchedAnnotate): useChatTemplate Set whether or not generate should apply a chat template - Notes ----- To use GPU inference with this annotator, make sure to use the Spark NLP GPU package and set @@ -208,29 +207,6 @@ class AutoGGUFModel(AnnotatorModel, HasBatchedAnnotate): When using larger models, we recommend adjusting GPU usage with `setNCtx` and `setNGpuLayers` according to your hardware to avoid out-of-memory errors. - References - ---------- - - `Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension - `__ - - https://github.com/pytorch/fairseq - - **Paper Abstract:** - *We present BART, a denoising autoencoder for pretraining sequence-to-sequence models. - BART is trained by (1) corrupting text with an arbitrary noising function, and (2) - learning a model to reconstruct the original text. 
It uses a standard Tranformer-based - neural machine translation architecture which, despite its simplicity, can be seen as - generalizing BERT (due to the bidirectional encoder), GPT (with the left-to-right decoder), - and many other more recent pretraining schemes. We evaluate a number of noising approaches, - finding the best performance by both randomly shuffling the order of the original sentences - and using a novel in-filling scheme, where spans of text are replaced with a single mask token. - BART is particularly effective when fine tuned for text generation but also works well for - comprehension tasks. It matches the performance of RoBERTa with comparable training resources - on GLUE and SQuAD, achieves new state-of-the-art results on a range of abstractive dialogue, - question answering, and summarization tasks, with gains of up to 6 ROUGE. BART also provides - a 1.1 BLEU increase over a back-translation system for machine translation, with only target - language pretraining. We also report ablation experiments that replicate other pretraining - schemes within the BART framework, to better measure which factors most influence end-task performance.* - Examples -------- >>> import sparknlp @@ -553,6 +529,13 @@ def setDefragmentationThreshold(self, defragmentationThreshold: float): def setNumaStrategy(self, numaStrategy: str): """Set optimization strategies that help on some NUMA systems (if available)""" + numaUpper = numaStrategy.upper() + numaStrategies = ["DISABLED", "DISTRIBUTE", "ISOLATE", "NUMA_CTL", "MIRROR"] + if numaUpper not in numaStrategies: + raise ValueError( + f"Invalid NUMA strategy: {numaUpper}. " + + f"Valid values are: {numaStrategies}" + ) return self._set(numaStrategy=numaStrategy) def setRopeScalingType(self, ropeScalingType: str): @@ -561,6 +544,13 @@ def setRopeScalingType(self, ropeScalingType: str): def setPoolingType(self, poolingType: bool): """Set the pooling type for embeddings, use model default if unspecified""" + poolingTypeUpper = poolingType.upper() + poolingTypes = ["NONE", "MEAN", "CLS", "LAST"] + if poolingTypeUpper not in poolingTypes: + raise ValueError( + f"Invalid pooling type: {poolingType}. 
" + + f"Valid values are: {poolingTypes}" + ) return self._set(poolingType=poolingType) def setModelDraft(self, modelDraft: str): diff --git a/python/sparknlp/internal/__init__.py b/python/sparknlp/internal/__init__.py index eec3544dc41c6f..4cb5321e8a8691 100644 --- a/python/sparknlp/internal/__init__.py +++ b/python/sparknlp/internal/__init__.py @@ -1007,10 +1007,17 @@ def __init__(self, path, jspark): "com.johnsnowlabs.nlp.embeddings.SnowFlakeEmbeddings.loadSavedModel", path, jspark ) + +class _AutoGGUFEmbeddingsLoader(ExtendedJavaWrapper): + def __init__(self, path, jspark): + super(_AutoGGUFEmbeddingsLoader, self).__init__( + "com.johnsnowlabs.nlp.embeddings.AutoGGUFEmbeddings.loadSavedModel", path, jspark) + + class _BLIPForQuestionAnswering(ExtendedJavaWrapper): def __init__(self, path, jspark): super(_BLIPForQuestionAnswering, self).__init__( "com.johnsnowlabs.nlp.annotators.cv.BLIPForQuestionAnswering.loadSavedModel", path, jspark, - ) \ No newline at end of file + ) diff --git a/python/test/annotator/embeddings/auto_gguf_embeddings_test.py b/python/test/annotator/embeddings/auto_gguf_embeddings_test.py new file mode 100644 index 00000000000000..72b82c19b6e830 --- /dev/null +++ b/python/test/annotator/embeddings/auto_gguf_embeddings_test.py @@ -0,0 +1,106 @@ +# Copyright 2017-2023 John Snow Labs +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import unittest + +import pytest + +from sparknlp.annotator import * +from sparknlp.base import * +from test.util import SparkContextForTest + + +@pytest.mark.slow +class AutoGGUFModelTestSpec(unittest.TestCase): + def setUp(self): + self.spark = SparkContextForTest.spark + self.data = ( + self.spark.createDataFrame( + [ + ["The moons of Jupiter are "], + ["Earth is "], + ["The moon is "], + ["The sun is "], + ] + ) + .toDF("text") + .repartition(1) + ) + self.document_assembler = ( + DocumentAssembler().setInputCol("text").setOutputCol("document") + ) + + def runTest(self): + model = ( + AutoGGUFEmbeddings.pretrained() + .setInputCols("document") + .setOutputCol("embeddings") + .setBatchSize(4) + .setNGpuLayers(99) + ) + + pipeline = Pipeline().setStages([self.document_assembler, model]) + results = pipeline.fit(self.data).transform(self.data) + collected = results.select("embeddings.embeddings").collect() + + for row in collected: + embds = row["embeddings"][0] + assert embds is not None + assert ( + sum(embds) > 0 + ), "Embeddings should not be zero. Was there an error on llama.cpp side?" 
+ + +@pytest.mark.slow +class AutoGGUFEmbeddingsPoolingTypeTestSpec(unittest.TestCase): + def setUp(self): + self.spark = SparkContextForTest.spark + self.data = ( + self.spark.createDataFrame( + [ + ["The moons of Jupiter are "], + ["Earth is "], + ["The moon is "], + ["The sun is "], + ] + ) + .toDF("text") + .repartition(1) + ) + self.document_assembler = ( + DocumentAssembler().setInputCol("text").setOutputCol("document") + ) + + def runTest(self): + model = ( + # AutoGGUFEmbeddings.pretrained() + AutoGGUFEmbeddings.loadSavedModel( + "models/nomic-embed-text-v1.5.Q8_0.gguf", SparkContextForTest.spark + ) + .setInputCols("document") + .setOutputCol("embeddings") + .setBatchSize(4) + .setNGpuLayers(99) + .setPoolingType("CLS") + ) + + pipeline = Pipeline().setStages([self.document_assembler, model]) + results = pipeline.fit(self.data).transform(self.data) + collected = results.select("embeddings.embeddings").collect() + + for row in collected: + embds = row["embeddings"][0] + assert embds is not None + assert ( + sum(embds) > 0 + ), "Embeddings should not be zero. Was there an error on llama.cpp side?" diff --git a/src/main/scala/com/johnsnowlabs/nlp/HasLlamaCppInferenceProperties.scala b/src/main/scala/com/johnsnowlabs/nlp/HasLlamaCppInferenceProperties.scala new file mode 100644 index 00000000000000..e200610b38a2a9 --- /dev/null +++ b/src/main/scala/com/johnsnowlabs/nlp/HasLlamaCppInferenceProperties.scala @@ -0,0 +1,572 @@ +package com.johnsnowlabs.nlp + +import com.johnsnowlabs.nlp.annotators.seq2seq.AutoGGUFModel +import com.johnsnowlabs.nlp.llama.InferenceParameters +import com.johnsnowlabs.nlp.llama.args._ +import com.johnsnowlabs.nlp.serialization.StructFeature +import org.apache.spark.ml.param._ + +import scala.collection.mutable +import scala.jdk.CollectionConverters._ + +/** Contains settable inference parameters for the [[AutoGGUFModel]]. + * + * @groupname param Parameters + * @groupname setParam Parameter setters + * @groupname getParam Parameter getters + * @groupprio setParam 1 + * @groupprio getParam 2 + * @groupprio param 3 + * @groupdesc param + * A list of (hyper-)parameter keys this annotator can take. Users can set and get the + * parameter values through setters and getters, respectively. 
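(Editor's note: to make the role of this trait concrete, here is a short sketch, not part of the source, of how a few of the inference properties defined below are typically chained on an `AutoGGUFModel`; the column names and parameter values are illustrative assumptions.)

```scala
import com.johnsnowlabs.nlp.annotator._

// Illustrative only: chaining a few HasLlamaCppInferenceProperties setters.
val model = AutoGGUFModel
  .pretrained()
  .setInputCols("document")
  .setOutputCol("completions")
  .setNPredict(64)        // number of tokens to predict
  .setTemperature(0.4f)   // sampling temperature
  .setTopK(40)            // top-k sampling
  .setTopP(0.9f)          // top-p sampling
  .setRepeatPenalty(1.1f) // penalty for repeated token sequences
  .setSeed(42)            // RNG seed for reproducibility
```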
+ */ +trait HasLlamaCppInferenceProperties { + this: ParamsAndFeaturesWritable with HasProtectedParams => + + /** @group param */ + val inputPrefix = + new Param[String](this, "inputPrefix", "Set the prompt to start generation with") + + /** @group param */ + val inputSuffix = + new Param[String](this, "inputSuffix", "Set a suffix for infilling") + + /** @group param */ + val cachePrompt = new BooleanParam( + this, + "cachePrompt", + "Whether to remember the prompt to avoid reprocessing it") + + /** @group param */ + val nPredict = new IntParam(this, "nPredict", "Set the number of tokens to predict") + + /** @group param */ + val topK = new IntParam(this, "topK", "Set top-k sampling") + + /** @group param */ + val topP = new FloatParam(this, "topP", "Set top-p sampling") + + /** @group param */ + val minP = new FloatParam(this, "minP", "Set min-p sampling") + + /** @group param */ + val tfsZ = new FloatParam(this, "tfsZ", "Set tail free sampling, parameter z") + + /** @group param */ + val typicalP = new FloatParam(this, "typicalP", "Set locally typical sampling, parameter p") + + /** @group param */ + val temperature = new FloatParam(this, "temperature", "Set the temperature") + + /** @group param */ + val dynamicTemperatureRange = + new FloatParam(this, "dynatempRange", "Set the dynamic temperature range") + + /** @group param */ + val dynamicTemperatureExponent = + new FloatParam(this, "dynatempExponent", "Set the dynamic temperature exponent") + + /** @group param */ + val repeatLastN = + new IntParam(this, "repeatLastN", "Set the last n tokens to consider for penalties") + + /** @group param */ + val repeatPenalty = + new FloatParam(this, "repeatPenalty", "Set the penalty of repeated sequences of tokens") + + /** @group param */ + val frequencyPenalty = + new FloatParam(this, "frequencyPenalty", "Set the repetition alpha frequency penalty") + + /** @group param */ + val presencePenalty = + new FloatParam(this, "presencePenalty", "Set the repetition alpha presence penalty") + + /** @group param */ + val miroStat = new Param[String](this, "miroStat", "Set MiroStat sampling strategies.") + + /** @group param */ + val miroStatTau = + new FloatParam(this, "mirostatTau", "Set the MiroStat target entropy, parameter tau") + + /** @group param */ + val miroStatEta = + new FloatParam(this, "mirostatEta", "Set the MiroStat learning rate, parameter eta") + + /** @group param */ + val penalizeNl = new BooleanParam(this, "penalizeNl", "Whether to penalize newline tokens") + + /** @group param */ + val nKeep = + new IntParam(this, "nKeep", "Set the number of tokens to keep from the initial prompt") + + /** @group param */ + val seed = new IntParam(this, "seed", "Set the RNG seed") + + /** @group param */ + val nProbs = new IntParam( + this, + "nProbs", + "Set the amount top tokens probabilities to output if greater than 0.") + + /** @group param */ + val minKeep = new IntParam( + this, + "minKeep", + "Set the amount of tokens the samplers should return at least (0 = disabled)") + + /** @group param */ + val grammar = + new Param[String](this, "grammar", "Set BNF-like grammar to constrain generations") + + /** @group param */ + val penaltyPrompt = new Param[String]( + this, + "penaltyPrompt", + "Override which part of the prompt is penalized for repetition.") + + /** @group param */ + val ignoreEos = new BooleanParam( + this, + "ignoreEos", + "Set whether to ignore end of stream token and continue generating (implies --logit-bias 2-inf)") + + // Modify the likelihood of tokens appearing in the 
completion by their id. + val tokenIdBias: StructFeature[Map[Int, Float]] = + new StructFeature[Map[Int, Float]](this, "tokenIdBias") + + // Modify the likelihood of tokens appearing in the completion by their string. + /** @group param */ + val tokenBias: StructFeature[Map[String, Float]] = + new StructFeature[Map[String, Float]](this, "tokenBias") + + /** @group param */ + val disableTokenIds = + new IntArrayParam(this, "disableTokenIds", "Set the token ids to disable in the completion") + + /** @group param */ + val stopStrings = new StringArrayParam( + this, + "stopStrings", + "Set strings upon seeing which token generation is stopped") + + /** @group param */ + val samplers = new StringArrayParam( + this, + "samplers", + "Set which samplers to use for token generation in the given order") + + /** @group param */ + val useChatTemplate = new BooleanParam( + this, + "useChatTemplate", + "Set whether or not generate should apply a chat template") + + /** Set the prompt to start generation with + * + * @group setParam + */ + def setInputPrefix(inputPrefix: String): this.type = { set(this.inputPrefix, inputPrefix) } + + /** Set a suffix for infilling + * + * @group setParam + */ + def setInputSuffix(inputSuffix: String): this.type = { set(this.inputSuffix, inputSuffix) } + + /** Whether to remember the prompt to avoid reprocessing it + * + * @group setParam + */ + def setCachePrompt(cachePrompt: Boolean): this.type = { set(this.cachePrompt, cachePrompt) } + + /** Set the number of tokens to predict + * + * @group setParam + */ + def setNPredict(nPredict: Int): this.type = { set(this.nPredict, nPredict) } + + /** Set top-k sampling + * + * @group setParam + */ + def setTopK(topK: Int): this.type = { set(this.topK, topK) } + + /** Set top-p sampling + * + * @group setParam + */ + def setTopP(topP: Float): this.type = { set(this.topP, topP) } + + /** Set min-p sampling + * + * @group setParam + */ + def setMinP(minP: Float): this.type = { set(this.minP, minP) } + + /** Set tail free sampling, parameter z + * @group setParam + */ + def setTfsZ(tfsZ: Float): this.type = { set(this.tfsZ, tfsZ) } + + /** Set locally typical sampling, parameter p + * + * @group setParam + */ + def setTypicalP(typicalP: Float): this.type = { set(this.typicalP, typicalP) } + + /** Set the temperature + * + * @group setParam + */ + def setTemperature(temperature: Float): this.type = { set(this.temperature, temperature) } + + /** Set the dynamic temperature range + * + * @group setParam + */ + def setDynamicTemperatureRange(dynatempRange: Float): this.type = { + set(this.dynamicTemperatureRange, dynatempRange) + } + + /** Set the dynamic temperature exponent + * + * @group setParam + */ + def setDynamicTemperatureExponent(dynatempExponent: Float): this.type = { + set(this.dynamicTemperatureExponent, dynatempExponent) + } + + /** Set the last n tokens to consider for penalties + * + * @group setParam + */ + def setRepeatLastN(repeatLastN: Int): this.type = { set(this.repeatLastN, repeatLastN) } + + /** Set the penalty of repeated sequences of tokens + * + * @group setParam + */ + def setRepeatPenalty(repeatPenalty: Float): this.type = { + set(this.repeatPenalty, repeatPenalty) + } + + /** Set the repetition alpha frequency penalty + * + * @group setParam + */ + def setFrequencyPenalty(frequencyPenalty: Float): this.type = { + set(this.frequencyPenalty, frequencyPenalty) + } + + /** Set the repetition alpha presence penalty + * + * @group setParam + */ + def setPresencePenalty(presencePenalty: Float): this.type = { + 
set(this.presencePenalty, presencePenalty) + } + + /** Set MiroStat sampling strategies. + * + * - DISABLED: No MiroStat + * - V1: MiroStat V1 + * - V2: MiroStat V2 + * + * @group setParam + */ + def setMiroStat(mirostat: String): this.type = set(this.miroStat, mirostat) + + /** Set the MiroStat target entropy, parameter tau + * + * @group setParam + */ + def setMiroStatTau(mirostatTau: Float): this.type = { set(this.miroStatTau, mirostatTau) } + + /** Set the MiroStat learning rate, parameter eta + * + * @group setParam + */ + def setMiroStatEta(mirostatEta: Float): this.type = { set(this.miroStatEta, mirostatEta) } + + /** Set whether to penalize newline tokens + * + * @group setParam + */ + def setPenalizeNl(penalizeNl: Boolean): this.type = { set(this.penalizeNl, penalizeNl) } + + /** Set the number of tokens to keep from the initial prompt + * + * @group setParam + */ + def setNKeep(nKeep: Int): this.type = { set(this.nKeep, nKeep) } + + /** Set the RNG seed + * + * @group setParam + */ + def setSeed(seed: Int): this.type = { set(this.seed, seed) } + + /** Set the amount top tokens probabilities to output if greater than 0. + * + * @group setParam + */ + def setNProbs(nProbs: Int): this.type = { set(this.nProbs, nProbs) } + + /** Set the amount of tokens the samplers should return at least (0 = disabled) + * + * @group setParam + */ + def setMinKeep(minKeep: Int): this.type = { set(this.minKeep, minKeep) } + + /** Set BNF-like grammar to constrain generations + * + * @group setParam + */ + def setGrammar(grammar: String): this.type = { set(this.grammar, grammar) } + + /** Override which part of the prompt is penalized for repetition. + * + * @group setParam + */ + def setPenaltyPrompt(penaltyPrompt: String): this.type = { + set(this.penaltyPrompt, penaltyPrompt) + } + + /** Set whether to ignore end of stream token and continue generating (implies --logit-bias + * 2-inf) + * + * @group setParam + */ + def setIgnoreEos(ignoreEos: Boolean): this.type = { set(this.ignoreEos, ignoreEos) } + + /** Set the tokens to disable during completion. + * + * @group setParam + */ + def setTokenBias(tokenBias: Map[String, Float]): this.type = { + set(this.tokenBias, tokenBias) + } + + /** Set the tokens to disable during completion. (Override for PySpark) + * + * @group setParam + */ + def setTokenBias(tokenBias: java.util.HashMap[String, java.lang.Double]): this.type = { + val scalaTokenBias = tokenBias.asScala.map { case (k, v) => k -> v.floatValue() } + set(this.tokenBias, scalaTokenBias.toMap) + } + + /** Set the token ids to disable in the completion. + * + * @group setParam + */ + def setTokenIdBias(tokenIdBias: Map[Int, Float]): this.type = { + set(this.tokenIdBias, tokenIdBias) + } + + /** Set the token ids to disable in the completion. (Override for PySpark) + * + * @group setParam + */ + def setTokenIdBias(tokenIdBias: java.util.HashMap[Integer, java.lang.Double]): this.type = { + val scalaTokenIdBias = tokenIdBias.asScala.map { case (k, v) => k.toInt -> v.toFloat } + set(this.tokenIdBias, scalaTokenIdBias.toMap) + } + + /** Set the token ids to disable in the completion. This corresponds to `setTokenBias` with a + * value of `Float.NEGATIVE_INFINITY`. 
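+ *
+ * For instance, given some configured [[AutoGGUFModel]] instance `model` (the token ids
+ * below are illustrative), this is presumably equivalent to biasing the same ids with
+ * `setTokenIdBias` and negative infinity:
+ * {{{
+ * model.setDisableTokenIds(Array(2, 13))
+ * // roughly the same effect as:
+ * model.setTokenIdBias(Map(2 -> Float.NegativeInfinity, 13 -> Float.NegativeInfinity))
+ * }}}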
+ * + * @group setParam + */ + def setDisableTokenIds(disableTokenIds: Array[Int]): this.type = { + set(this.disableTokenIds, disableTokenIds) + } + + /** Set strings upon seeing which token generation is stopped + * + * @group setParam + */ + def setStopStrings(stopStrings: Array[String]): this.type = { + set(this.stopStrings, stopStrings) + } + + /** Set which samplers to use for token generation in the given order . + * + * Available Samplers are: + * + * - TOP_K: Top-k sampling + * - TFS_Z: Tail free sampling + * - TYPICAL_P: Locally typical sampling p + * - TOP_P: Top-p sampling + * - MIN_P: Min-p sampling + * - TEMPERATURE: Temperature sampling + * @group setParam + */ + def setSamplers(samplers: Array[String]): this.type = { set(this.samplers, samplers) } + + /** Set whether or not generate should apply a chat template + * + * @group setParam + */ + def setUseChatTemplate(useChatTemplate: Boolean): this.type = { + set(this.useChatTemplate, useChatTemplate) + } + + // ---------------- GETTERS ---------------- + /** @group getParam */ + def getInputPrefix: String = $(inputPrefix) + + /** @group getParam */ + def getInputSuffix: String = $(inputSuffix) + + /** @group getParam */ + def getCachePrompt: Boolean = $(cachePrompt) + + def getNPredict: Int = $(nPredict) + + /** @group getParam */ + def getTopK: Int = $(topK) + + /** @group getParam */ + def getTopP: Float = $(topP) + + /** @group getParam */ + def getMinP: Float = $(minP) + + /** @group getParam */ + def getTfsZ: Float = $(tfsZ) + + /** @group getParam */ + def getTypicalP: Float = $(typicalP) + + /** @group getParam */ + def getTemperature: Float = $(temperature) + + /** @group getParam */ + def getDynamicTemperatureRange: Float = $(dynamicTemperatureRange) + + /** @group getParam */ + def getDynamicTemperatureExponent: Float = $(dynamicTemperatureExponent) + + /** @group getParam */ + def getRepeatLastN: Int = $(repeatLastN) + + /** @group getParam */ + def getRepeatPenalty: Float = $(repeatPenalty) + + /** @group getParam */ + def getFrequencyPenalty: Float = $(frequencyPenalty) + + /** @group getParam */ + def getPresencePenalty: Float = $(presencePenalty) + + /** @group getParam */ + def getMiroStat: String = $(miroStat) + + /** @group getParam */ + def getMiroStatTau: Float = $(miroStatTau) + + /** @group getParam */ + def getMiroStatEta: Float = $(miroStatEta) + + /** @group getParam */ + def getPenalizeNl: Boolean = $(penalizeNl) + + /** @group getParam */ + def getNKeep: Int = $(nKeep) + + /** @group getParam */ + def getSeed: Int = $(seed) + + /** @group getParam */ + def getNProbs: Int = $(nProbs) + + /** @group getParam */ + def getMinKeep: Int = $(minKeep) + + /** @group getParam */ + def getGrammar: String = $(grammar) + + /** @group getParam */ + def getPenaltyPrompt: String = $(penaltyPrompt) + + /** @group getParam */ + def getIgnoreEos: Boolean = $(ignoreEos) + + /** @group getParam */ + def getTokenIdBias: Map[Int, Float] = $$(tokenIdBias) + + /** @group getParam */ + def getTokenBias: Map[String, Float] = $$(tokenBias) + + /** @group getParam */ + def getDisableTokenIds: Array[Int] = $(disableTokenIds) + + /** @group getParam */ + def getStopStrings: Array[String] = $(stopStrings) + + /** @group getParam */ + def getSamplers: Array[String] = $(samplers) + + /** @group getParam */ + def getUseChatTemplate: Boolean = $(useChatTemplate) + + protected def getInferenceParameters: InferenceParameters = { + val inferenceParams = new InferenceParameters("") + if (isDefined(cachePrompt)) 
inferenceParams.setCachePrompt(getCachePrompt) + if (isDefined(disableTokenIds)) { + val javaCollection: java.util.Collection[Integer] = + getDisableTokenIds.map(int2Integer).toSeq.asJava + inferenceParams.disableTokenIds(javaCollection) + } + if (isDefined(dynamicTemperatureExponent)) + inferenceParams.setDynamicTemperatureExponent(getDynamicTemperatureExponent) + if (isDefined(dynamicTemperatureRange)) + inferenceParams.setDynamicTemperatureRange(getDynamicTemperatureRange) + if (isDefined(frequencyPenalty)) inferenceParams.setFrequencyPenalty(getFrequencyPenalty) + if (isDefined(grammar)) inferenceParams.setGrammar(getGrammar) + if (isDefined(ignoreEos)) inferenceParams.setIgnoreEos(getIgnoreEos) + if (isDefined(inputPrefix)) inferenceParams.setInputPrefix(getInputPrefix) + if (isDefined(inputSuffix)) inferenceParams.setInputSuffix(getInputSuffix) + if (isDefined(minKeep)) inferenceParams.setMinKeep(getMinKeep) + if (isDefined(minP)) inferenceParams.setMinP(getMinP) + if (isDefined(miroStat)) inferenceParams.setMiroStat(MiroStat.valueOf(getMiroStat)) + if (isDefined(miroStatEta)) inferenceParams.setMiroStatEta(getMiroStatEta) + if (isDefined(miroStatTau)) inferenceParams.setMiroStatTau(getMiroStatTau) + if (isDefined(nKeep)) inferenceParams.setNKeep(getNKeep) + if (isDefined(nPredict)) inferenceParams.setNPredict(getNPredict) + if (isDefined(nProbs)) inferenceParams.setNProbs(getNProbs) + if (isDefined(penalizeNl)) inferenceParams.setPenalizeNl(getPenalizeNl) + if (isDefined(penaltyPrompt)) inferenceParams.setPenaltyPrompt(getPenaltyPrompt) + if (isDefined(presencePenalty)) inferenceParams.setPresencePenalty(getPresencePenalty) + if (isDefined(repeatLastN)) inferenceParams.setRepeatLastN(getRepeatLastN) + if (isDefined(repeatPenalty)) inferenceParams.setRepeatPenalty(getRepeatPenalty) + if (isDefined(samplers)) inferenceParams.setSamplers(getSamplers.map(Sampler.valueOf): _*) + if (isDefined(seed)) inferenceParams.setSeed(getSeed) + if (isDefined(stopStrings)) inferenceParams.setStopStrings(getStopStrings: _*) + if (isDefined(temperature)) inferenceParams.setTemperature(getTemperature) + if (isDefined(tfsZ)) inferenceParams.setTfsZ(getTfsZ) + if (isDefined(topK)) inferenceParams.setTopK(getTopK) + if (isDefined(topP)) inferenceParams.setTopP(getTopP) + if (isDefined(typicalP)) inferenceParams.setTypicalP(getTypicalP) + if (isDefined(useChatTemplate)) inferenceParams.setUseChatTemplate(getUseChatTemplate) + if (tokenBias.isSet) { + val tokenBiasMap: mutable.Map[String, java.lang.Float] = mutable.Map(getTokenBias.map { + case (key, value) => (key, float2Float(value)) + }.toSeq: _*) + inferenceParams.setTokenBias(tokenBiasMap.asJava) + } + if (tokenIdBias.isSet) { + val tokenIdBiasMap: mutable.Map[Integer, java.lang.Float] = + mutable.Map(getTokenIdBias.map { case (key, value) => + (int2Integer(key), float2Float(value)) + }.toSeq: _*) + inferenceParams.setTokenIdBias(tokenIdBiasMap.asJava) + } + + inferenceParams + } + +} diff --git a/src/main/scala/com/johnsnowlabs/nlp/HasLlamaCppModelProperties.scala b/src/main/scala/com/johnsnowlabs/nlp/HasLlamaCppModelProperties.scala new file mode 100644 index 00000000000000..e71a7b999f25c2 --- /dev/null +++ b/src/main/scala/com/johnsnowlabs/nlp/HasLlamaCppModelProperties.scala @@ -0,0 +1,853 @@ +package com.johnsnowlabs.nlp + +import com.johnsnowlabs.nlp.annotators.seq2seq.AutoGGUFModel +import com.johnsnowlabs.nlp.llama.ModelParameters +import com.johnsnowlabs.nlp.llama.args.{GpuSplitMode, NumaStrategy, PoolingType, RopeScalingType} +import 
com.johnsnowlabs.nlp.serialization.StructFeature +import org.apache.spark.ml.param._ +import org.apache.spark.sql.SparkSession +import org.json4s.DefaultFormats +import org.json4s.jackson.JsonMethods +import org.slf4j.LoggerFactory + +import scala.collection.mutable +import scala.jdk.CollectionConverters._ + +/** Contains settable model parameters for the [[AutoGGUFModel]]. + * + * @groupname param Parameters + * @groupname setParam Parameter setters + * @groupname getParam Parameter getters + * @groupprio setParam 1 + * @groupprio getParam 2 + * @groupprio param 3 + * @groupdesc param + * A list of (hyper-)parameter keys this annotator can take. Users can set and get the + * parameter values through setters and getters, respectively. + */ +trait HasLlamaCppModelProperties { + this: ParamsAndFeaturesWritable with HasProtectedParams => + protected val logger = LoggerFactory.getLogger(this.getClass) + + /** @group param */ + val nThreads = + new IntParam(this, "nThreads", "Set the number of threads to use during generation") + + /** @group param */ + val nThreadsDraft = new IntParam( + this, + "nThreadsDraft", + "Set the number of threads to use during draft generation") + + /** @group param */ + val nThreadsBatch = new IntParam( + this, + "nThreadsBatch", + "Set the number of threads to use during batch and prompt processing") + + /** @group param */ + val nThreadsBatchDraft = new IntParam( + this, + "nThreadsBatchDraft", + "Set the number of threads to use during batch and prompt processing") + + /** @group param */ + val nCtx = new IntParam(this, "nCtx", "Set the size of the prompt context") + + /** @group param */ + val nBatch = new IntParam( + this, + "nBatch", + "Set the logical batch size for prompt processing (must be >=32 to use BLAS)") + + /** @group param */ + val nUbatch = new IntParam( + this, + "nUbatch", + "Set the physical batch size for prompt processing (must be >=32 to use BLAS)") + + /** @group param */ + val nDraft = + new IntParam(this, "nDraft", "Set the number of tokens to draft for speculative decoding") + + /** @group param */ + val nChunks = new IntParam(this, "nChunks", "Set the maximal number of chunks to process") + + /** @group param */ + val nSequences = + new IntParam(this, "nSequences", "Set the number of sequences to decode") + + /** @group param */ + val pSplit = new FloatParam(this, "pSplit", "Set the speculative decoding split probability") + + /** @group param */ + val nGpuLayers = new IntParam( + this, + "nGpuLayers", + "Set the number of layers to store in VRAM (-1 - use default)") + + /** @group param */ + val nGpuLayersDraft = new IntParam( + this, + "nGpuLayersDraft", + "Set the number of layers to store in VRAM for the draft model (-1 - use default)") + + /** Set how to split the model across GPUs + * + * - NONE: No GPU split + * - LAYER: Split the model across GPUs by layer + * - ROW: Split the model across GPUs by rows + * + * @group param + */ + val gpuSplitMode = + new Param[String](this, "gpuSplitMode", "Set how to split the model across GPUs") + + /** @group param */ + val mainGpu = + new IntParam(this, "mainGpu", "Set the main GPU that is used for scratch and small tensors.") + + /** @group param */ + val tensorSplit = new DoubleArrayParam( + this, + "tensorSplit", + "Set how split tensors should be distributed across GPUs") + + /** @group param */ + val grpAttnN = new IntParam(this, "grpAttnN", "Set the group-attention factor") + + /** @group param */ + val grpAttnW = new IntParam(this, "grpAttnW", "Set the group-attention width") + + /** 
@group param */ + val ropeFreqBase = + new FloatParam(this, "ropeFreqBase", "Set the RoPE base frequency, used by NTK-aware scaling") + + /** @group param */ + val ropeFreqScale = new FloatParam( + this, + "ropeFreqScale", + "Set the RoPE frequency scaling factor, expands context by a factor of 1/N") + + /** @group param */ + val yarnExtFactor = + new FloatParam(this, "yarnExtFactor", "Set the YaRN extrapolation mix factor") + + /** @group param */ + val yarnAttnFactor = + new FloatParam(this, "yarnAttnFactor", "Set the YaRN scale sqrt(t) or attention magnitude") + + /** @group param */ + val yarnBetaFast = + new FloatParam(this, "yarnBetaFast", "Set the YaRN low correction dim or beta") + + /** @group param */ + val yarnBetaSlow = + new FloatParam(this, "yarnBetaSlow", "Set the YaRN high correction dim or alpha") + + /** @group param */ + val yarnOrigCtx = + new IntParam(this, "yarnOrigCtx", "Set the YaRN original context size of model") + + /** @group param */ + val defragmentationThreshold = + new FloatParam(this, "defragmentationThreshold", "Set the KV cache defragmentation threshold") + + /** Set optimization strategies that help on some NUMA systems (if available) + * + * Available Strategies: + * + * - DISABLED: No NUMA optimizations + * - DISTRIBUTE: Spread execution evenly over all + * - ISOLATE: Only spawn threads on CPUs on the node that execution started on + * - NUMA_CTL: Use the CPU map provided by numactl + * - MIRROR: Mirrors the model across NUMA nodes + * + * @group param + */ + val numaStrategy = new Param[String]( + this, + "numaStrategy", + "Set optimization strategies that help on some NUMA systems (if available)") + + /** Set the RoPE frequency scaling method, defaults to linear unless specified by the model. + * + * - UNSPECIFIED: Don't use any scaling + * - LINEAR: Linear scaling + * - YARN: YaRN RoPE scaling + * + * @group param + */ + val ropeScalingType = new Param[String]( + this, + "ropeScalingType", + "Set the RoPE frequency scaling method, defaults to linear unless specified by the model") + + /** Set the pooling type for embeddings, use model default if unspecified + * + * - 0 NONE: Don't use any pooling + * - 1 MEAN: Mean Pooling + * - 2 CLS: Choose the CLS token + * - 3 LAST: Choose the last token + * + * @group param + */ + val poolingType = new Param[String]( + this, + "poolingType", + "Set the pooling type for embeddings, use model default if unspecified") + + /** @group param */ + val modelDraft = + new Param[String](this, "modelDraft", "Set the draft model for speculative decoding") + + /** @group param */ + val lookupCacheStaticFilePath = new Param[String]( + this, + "lookupCacheStaticFilePath", + "Set path to static lookup cache to use for lookup decoding (not updated by generation)") + + /** @group param */ + val lookupCacheDynamicFilePath = new Param[String]( + this, + "lookupCacheDynamicFilePath", + "Set path to dynamic lookup cache to use for lookup decoding (updated by generation)") + + /** @group param */ + val loraAdapters = new StructFeature[Map[String, Float]](this, "loraAdapters") + + /** @group param */ + val embedding = + new BooleanParam(this, "embedding", "Whether to load model with embedding support") + + /** @group param */ + val flashAttention = + new BooleanParam(this, "flashAttention", "Whether to enable Flash Attention") + + /** @group param */ + val inputPrefixBos = new BooleanParam( + this, + "inputPrefixBos", + "Whether to add prefix BOS to user inputs, preceding the `--in-prefix` string") + + /** @group param */ + val 
useMmap = new BooleanParam( + this, + "useMmap", + "Whether to use memory-map model (faster load but may increase pageouts if not using mlock)") + + /** @group param */ + val useMlock = new BooleanParam( + this, + "useMlock", + "Whether to force the system to keep model in RAM rather than swapping or compressing") + + /** @group param */ + val noKvOffload = new BooleanParam(this, "noKvOffload", "Whether to disable KV offload") + + /** @group param */ + val systemPrompt = new Param[String](this, "systemPrompt", "Set a system prompt to use") + + /** @group param */ + val chatTemplate = + new Param[String](this, "chatTemplate", "The chat template to use") + + private def checkEmbeddingMode(setter: => this.type): this.type = { + if (getEmbedding) { + logger.warn("Embeddings enabled. This parameter has no effect.") + this + } else + setter + } + + /** Set the number of threads to use during generation + * + * @group setParam + */ + def setNThreads(nThreads: Int): this.type = { + set(this.nThreads, nThreads) + } + + /** Set the number of threads to use during draft generation + * + * @group setParam + */ + def setNThreadsDraft(nThreadsDraft: Int): this.type = { + checkEmbeddingMode { set(this.nThreadsDraft, nThreadsDraft) } + } + + /** Set the number of threads to use during batch and prompt processing + * + * @group setParam + */ + def setNThreadsBatch(nThreadsBatch: Int): this.type = { + checkEmbeddingMode { set(this.nThreadsBatch, nThreadsBatch) } + } + + /** Set the number of threads to use during batch and prompt processing + * + * @group setParam + */ + def setNThreadsBatchDraft(nThreadsBatchDraft: Int): this.type = { + checkEmbeddingMode { set(this.nThreadsBatchDraft, nThreadsBatchDraft) } + } + + /** Set the size of the prompt context + * + * @group setParam + */ + def setNCtx(nCtx: Int): this.type = { + set(this.nCtx, nCtx) + } + + /** Set the logical batch size for prompt processing (must be >=32 to use BLAS) + * + * @group setParam + */ + def setNBatch(nBatch: Int): this.type = { + set(this.nBatch, nBatch) + } + + /** Set the physical batch size for prompt processing (must be >=32 to use BLAS) + * + * @group setParam + */ + def setNUbatch(nUbatch: Int): this.type = { + set(this.nUbatch, nUbatch) + } + + /** Set the number of tokens to draft for speculative decoding + * + * @group setParam + */ + def setNDraft(nDraft: Int): this.type = { + checkEmbeddingMode { set(this.nDraft, nDraft) } + } + + /** Set the maximal number of chunks to process + * + * @group setParam + */ + def setNChunks(nChunks: Int): this.type = { + set(this.nChunks, nChunks) + } + + /** Set the number of sequences to decode + * + * @group setParam + */ + def setNSequences(nSequences: Int): this.type = { + set(this.nSequences, nSequences) + } + + /** Set the speculative decoding split probability + * + * @group setParam + */ + def setPSplit(pSplit: Float): this.type = { + checkEmbeddingMode { set(this.pSplit, pSplit) } + } + + /** Set the number of layers to store in VRAM (-1 - use default) + * + * @group setParam + */ + def setNGpuLayers(nGpuLayers: Int): this.type = { + set(this.nGpuLayers, nGpuLayers) + } + + /** Set the number of layers to store in VRAM for the draft model (-1 - use default) + * + * @group setParam + */ + def setNGpuLayersDraft(nGpuLayersDraft: Int): this.type = { + checkEmbeddingMode { set(this.nGpuLayersDraft, nGpuLayersDraft) } + } + + /** Set how to split the model across GPUs + * + * - NONE: No GPU split + * -LAYER: Split the model across GPUs by layer 2. 
ROW: Split the model across GPUs by rows + * + * @group setParam + */ + def setGpuSplitMode(splitMode: String): this.type = { + set(this.gpuSplitMode, splitMode) + } + + /** Set the GPU that is used for scratch and small tensors + * + * @group setParam + */ + def setMainGpu(mainGpu: Int): this.type = { + set(this.mainGpu, mainGpu) + } + + /** Set how split tensors should be distributed across GPUs + * + * @group setParam + */ + def setTensorSplit(tensorSplit: Array[Double]): this.type = { + set(this.tensorSplit, tensorSplit) + } + + /** Set the group-attention factor + * + * @group setParam + */ + def setGrpAttnN(grpAttnN: Int): this.type = { + set(this.grpAttnN, grpAttnN) + } + + /** Set the group-attention width + * + * @group setParam + */ + def setGrpAttnW(grpAttnW: Int): this.type = { + set(this.grpAttnW, grpAttnW) + } + + /** Set the RoPE base frequency, used by NTK-aware scaling + * + * @group setParam + */ + def setRopeFreqBase(ropeFreqBase: Float): this.type = { + set(this.ropeFreqBase, ropeFreqBase) + } + + /** Set the RoPE frequency scaling factor, expands context by a factor of 1/N + * + * @group setParam + */ + def setRopeFreqScale(ropeFreqScale: Float): this.type = { + set(this.ropeFreqScale, ropeFreqScale) + } + + /** Set the YaRN extrapolation mix factor + * + * @group setParam + */ + def setYarnExtFactor(yarnExtFactor: Float): this.type = { + set(this.yarnExtFactor, yarnExtFactor) + } + + /** Set the YaRN scale sqrt(t) or attention magnitude + * + * @group setParam + */ + def setYarnAttnFactor(yarnAttnFactor: Float): this.type = { + set(this.yarnAttnFactor, yarnAttnFactor) + } + + /** Set the YaRN low correction dim or beta + * + * @group setParam + */ + def setYarnBetaFast(yarnBetaFast: Float): this.type = { + set(this.yarnBetaFast, yarnBetaFast) + } + + /** Set the YaRN high correction dim or alpha + * + * @group setParam + */ + def setYarnBetaSlow(yarnBetaSlow: Float): this.type = { + set(this.yarnBetaSlow, yarnBetaSlow) + } + + /** Set the YaRN original context size of model + * + * @group setParam + */ + def setYarnOrigCtx(yarnOrigCtx: Int): this.type = { + set(this.yarnOrigCtx, yarnOrigCtx) + } + + /** Set the KV cache defragmentation threshold + * + * @group setParam + */ + def setDefragmentationThreshold(defragThold: Float): this.type = { + set(this.defragmentationThreshold, defragThold) + } + + /** Set optimization strategies that help on some NUMA systems (if available) + * + * Available Strategies: + * + * - DISABLED: No NUMA optimizations + * - DISTRIBUTE: spread execution evenly over all + * - ISOLATE: only spawn threads on CPUs on the node that execution started on + * - NUMA_CTL: use the CPU map provided by numactl + * - MIRROR: Mirrors the model across NUMA nodes + * + * @group setParam + */ + def setNumaStrategy(numa: String): this.type = { + val numaUpper = numa.toUpperCase + val numaStrategies = Array("DISABLED", "DISTRIBUTE", "ISOLATE", "NUMA_CTL", "MIRROR") + require( + numaStrategies.contains(numaUpper), + s"Invalid NUMA strategy: $numa. " + + s"Valid values are: ${numaStrategies.mkString(", ")}") + set(this.numaStrategy, numaUpper) + } + + /** Set the RoPE frequency scaling method, defaults to linear unless specified by the model. 
+ * + * - UNSPECIFIED: Don't use any scaling + * - LINEAR: Linear scaling + * - YARN: YaRN RoPE scaling + * + * @group setParam + */ + def setRopeScalingType(ropeScalingType: String): this.type = { + set(this.ropeScalingType, ropeScalingType) + } + + /** Set the pooling type for embeddings, use model default if unspecified + * + * - 0 NONE: Don't use any pooling and return token embeddings (if the model supports it) + * - 1 MEAN: Mean Pooling + * - 2 CLS: Choose the CLS token + * - 3 LAST: Choose the last token + * + * @group setParam + */ + def setPoolingType(poolingType: String): this.type = { + val poolingTypeUpper = poolingType.toUpperCase + val poolingTypes = Array("NONE", "MEAN", "CLS", "LAST") + require( + poolingTypes.contains(poolingTypeUpper), + s"Invalid pooling type: $poolingType. " + + s"Valid values are: ${poolingTypes.mkString(", ")}") + set(this.poolingType, poolingTypeUpper) + } + + /** Set the draft model for speculative decoding + * + * @group setParam + */ + def setModelDraft(modelDraft: String): this.type = { + checkEmbeddingMode { set(this.modelDraft, modelDraft) } + } + + /** Set path to static lookup cache to use for lookup decoding (not updated by generation) + * + * @group setParam + */ + def setLookupCacheStaticFilePath(lookupCacheStaticFilePath: String): this.type = { + checkEmbeddingMode { set(this.lookupCacheStaticFilePath, lookupCacheStaticFilePath) } + } + + /** Set path to dynamic lookup cache to use for lookup decoding (updated by generation) + * + * @group setParam + */ + def setLookupCacheDynamicFilePath(lookupCacheDynamicFilePath: String): this.type = { + checkEmbeddingMode { set(this.lookupCacheDynamicFilePath, lookupCacheDynamicFilePath) } + } + + /** Sets paths to lora adapters with user defined scale. + * + * @group setParam + */ + def setLoraAdapters(loraAdapters: Map[String, Float]): this.type = { + set(this.loraAdapters, loraAdapters) + } + + /** Sets paths to lora adapters with user defined scale. 
(PySpark Override) + * + * @group setParam + */ + def setLoraAdapters(loraAdapters: java.util.HashMap[String, java.lang.Double]): this.type = { + val scalaLoraAdapters = loraAdapters.asScala.map { case (k, v) => k -> v.floatValue() } + set(this.loraAdapters, scalaLoraAdapters.toMap) + } + + /** Whether to load model with embedding support + * + * @group setParam + */ + def setEmbedding(embedding: Boolean): this.type = { + set(this.embedding, embedding) + } + + /** Whether to enable Flash Attention + * + * @group setParam + */ + def setFlashAttention(flashAttention: Boolean): this.type = { + set(this.flashAttention, flashAttention) + } + + /** Whether to add prefix BOS to user inputs, preceding the `--in-prefix` string + * + * @group setParam + */ + def setInputPrefixBos(inputPrefixBos: Boolean): this.type = { + set(this.inputPrefixBos, inputPrefixBos) + } + + /** Whether to use memory-map model (faster load but may increase pageouts if not using mlock) + * + * @group setParam + */ + def setUseMmap(useMmap: Boolean): this.type = { + set(this.useMmap, useMmap) + } + + /** Whether to force the system to keep model in RAM rather than swapping or compressing + * + * @group setParam + */ + def setUseMlock(useMlock: Boolean): this.type = { + set(this.useMlock, useMlock) + } + + /** Whether to disable KV offload + * + * @group setParam + */ + def setNoKvOffload(noKvOffload: Boolean): this.type = { + set(this.noKvOffload, noKvOffload) + } + + /** Set a system prompt to use + * + * @group setParam + */ + def setSystemPrompt(systemPrompt: String): this.type = { + checkEmbeddingMode { set(this.systemPrompt, systemPrompt) } + } + + /** The chat template to use + * + * @group setParam + */ + def setChatTemplate(chatTemplate: String): this.type = { + checkEmbeddingMode { set(this.chatTemplate, chatTemplate) } + } + + /** @group getParam */ + def getNThreads: Int = $(nThreads) + + /** @group getParam */ + def getNThreadsDraft: Int = $(nThreadsDraft) + + /** @group getParam */ + def getNThreadsBatch: Int = $(nThreadsBatch) + + /** @group getParam */ + def getNThreadsBatchDraft: Int = $(nThreadsBatchDraft) + + /** @group getParam */ + def getNCtx: Int = $(nCtx) + + /** @group getParam */ + def getNBatch: Int = $(nBatch) + + /** @group getParam */ + def getNUbatch: Int = $(nUbatch) + + /** @group getParam */ + def getNDraft: Int = $(nDraft) + + /** @group getParam */ + def getNChunks: Int = $(nChunks) + + /** @group getParam */ + def getNSequences: Int = $(nSequences) + + /** @group getParam */ + def getPSplit: Float = $(pSplit) + + /** @group getParam */ + def getNGpuLayers: Int = $(nGpuLayers) + + /** @group getParam */ + def getNGpuLayersDraft: Int = $(nGpuLayersDraft) + + /** @group getParam */ + def getSplitMode: String = $(gpuSplitMode) + + /** @group getParam */ + def getMainGpu: Int = $(mainGpu) + + /** @group getParam */ + def getTensorSplit: Array[Double] = $(tensorSplit) + + def getGrpAttnN: Int = $(grpAttnN) + + /** @group getParam */ + def getGrpAttnW: Int = $(grpAttnW) + + /** @group getParam */ + def getRopeFreqBase: Float = $(ropeFreqBase) + + /** @group getParam */ + def getRopeFreqScale: Float = $(ropeFreqScale) + + /** @group getParam */ + def getYarnExtFactor: Float = $(yarnExtFactor) + + /** @group getParam */ + def getYarnAttnFactor: Float = $(yarnAttnFactor) + + /** @group getParam */ + def getYarnBetaFast: Float = $(yarnBetaFast) + + /** @group getParam */ + def getYarnBetaSlow: Float = $(yarnBetaSlow) + + /** @group getParam */ + def getYarnOrigCtx: Int = $(yarnOrigCtx) + + /** 
@group getParam */ + def getDefragmentationThreshold: Float = $(defragmentationThreshold) + + /** @group getParam */ + def getNuma: String = $(numaStrategy) + + /** @group getParam */ + def getRopeScalingType: String = $(ropeScalingType) + + /** @group getParam */ + def getPoolingType: String = $(poolingType) + + /** @group getParam */ + def getModelDraft: String = $(modelDraft) + + /** @group getParam */ + def getLookupCacheStaticFilePath: String = $(lookupCacheStaticFilePath) + + /** @group getParam */ + def getLookupCacheDynamicFilePath: String = $(lookupCacheDynamicFilePath) + + /** @group getParam */ + def getLoraAdapters: Map[String, Float] = $$(loraAdapters) + + /** @group getParam */ + def getEmbedding: Boolean = $(embedding) + + /** @group getParam */ + def getFlashAttention: Boolean = $(flashAttention) + + /** @group getParam */ + def getInputPrefixBos: Boolean = $(inputPrefixBos) + + /** @group getParam */ + def getUseMmap: Boolean = $(useMmap) + + /** @group getParam */ + def getUseMlock: Boolean = $(useMlock) + + /** @group getParam */ + def getNoKvOffload: Boolean = $(noKvOffload) + + /** @group getParam */ + def getSystemPrompt: String = $(systemPrompt) + + /** @group getParam */ + def getChatTemplate: String = $(chatTemplate) + + // ---------------- METADATA ---------------- + val metadata = + new Param[String](this, "metadata", "Set the metadata for the model").setProtected() + + /** Set the metadata for the model + * @group setParam + */ + def setMetadata(metadata: String): this.type = { set(this.metadata, metadata) } + + /** Get the metadata for the model + * @group getParam + */ + def getMetadata: String = $(metadata) + + def getMetadataMap: Map[String, String] = { + val metadataJsonString = getMetadata + if (metadataJsonString.isEmpty) Map.empty + else { + implicit val formats: DefaultFormats.type = DefaultFormats + JsonMethods.parse(metadataJsonString).extract[Map[String, String]] + } + } + + protected def getModelParameters: ModelParameters = { + val modelParameters = new ModelParameters().setContinuousBatching(true) // Always enabled + + if (isDefined(chatTemplate)) modelParameters.setChatTemplate(getChatTemplate) + if (isDefined(defragmentationThreshold)) + modelParameters.setDefragmentationThreshold(getDefragmentationThreshold) + if (isDefined(embedding)) modelParameters.setEmbedding(getEmbedding) + if (isDefined(flashAttention)) modelParameters.setFlashAttention(getFlashAttention) + if (isDefined(gpuSplitMode)) + modelParameters.setSplitMode(GpuSplitMode.valueOf(getSplitMode)) + if (isDefined(grpAttnN)) modelParameters.setGrpAttnN(getGrpAttnN) + if (isDefined(grpAttnW)) modelParameters.setGrpAttnN(getGrpAttnW) + if (isDefined(inputPrefixBos)) modelParameters.setInputPrefixBos(getInputPrefixBos) + if (isDefined(lookupCacheDynamicFilePath)) + modelParameters.setLookupCacheDynamicFilePath(getLookupCacheDynamicFilePath) + if (isDefined(lookupCacheStaticFilePath)) + modelParameters.setLookupCacheStaticFilePath(getLookupCacheStaticFilePath) + if (isDefined(mainGpu)) modelParameters.setMainGpu(getMainGpu) + if (isDefined(modelDraft)) modelParameters.setModelDraft(getModelDraft) + if (isDefined(nBatch)) modelParameters.setNBatch(getNBatch) + if (isDefined(nChunks)) modelParameters.setNChunks(getNChunks) + if (isDefined(nCtx)) modelParameters.setNCtx(getNCtx) + if (isDefined(nDraft)) modelParameters.setNDraft(getNDraft) + if (isDefined(nGpuLayers)) modelParameters.setNGpuLayers(getNGpuLayers) + if (isDefined(nGpuLayersDraft)) 
modelParameters.setNGpuLayersDraft(getNGpuLayersDraft) + if (isDefined(nSequences)) modelParameters.setNSequences(getNSequences) + if (isDefined(nThreads)) modelParameters.setNThreads(getNThreads) + if (isDefined(nThreadsBatch)) modelParameters.setNThreadsBatch(getNThreadsBatch) + if (isDefined(nThreadsBatchDraft)) + modelParameters.setNThreadsBatchDraft(getNThreadsBatchDraft) + if (isDefined(nThreadsDraft)) modelParameters.setNThreadsDraft(getNThreadsDraft) + if (isDefined(nUbatch)) modelParameters.setNUbatch(getNUbatch) + if (isDefined(noKvOffload)) modelParameters.setNoKvOffload(getNoKvOffload) + if (isDefined(numaStrategy)) modelParameters.setNuma(NumaStrategy.valueOf(getNuma)) + if (isDefined(pSplit)) modelParameters.setPSplit(getPSplit) + if (isDefined(poolingType)) + modelParameters.setPoolingType(PoolingType.valueOf(getPoolingType)) + if (isDefined(ropeFreqBase)) modelParameters.setRopeFreqBase(getRopeFreqBase) + if (isDefined(ropeFreqScale)) modelParameters.setRopeFreqScale(getRopeFreqScale) + if (isDefined(ropeScalingType)) + modelParameters.setRopeScalingType(RopeScalingType.valueOf(getRopeScalingType)) + if (isDefined(systemPrompt)) modelParameters.setSystemPrompt(getSystemPrompt) + if (isDefined(tensorSplit)) modelParameters.setTensorSplit(getTensorSplit.map(_.toFloat)) + if (isDefined(useMlock)) modelParameters.setUseMlock(getUseMlock) + if (isDefined(useMmap)) modelParameters.setUseMmap(getUseMmap) + if (isDefined(yarnAttnFactor)) modelParameters.setYarnAttnFactor(getYarnAttnFactor) + if (isDefined(yarnBetaFast)) modelParameters.setYarnBetaFast(getYarnBetaFast) + if (isDefined(yarnBetaSlow)) modelParameters.setYarnBetaSlow(getYarnBetaSlow) + if (isDefined(yarnExtFactor)) modelParameters.setYarnExtFactor(getYarnExtFactor) + if (isDefined(yarnOrigCtx)) modelParameters.setYarnOrigCtx(getYarnOrigCtx) + if (loraAdapters.isSet) { + val loraAdaptersMap: mutable.Map[String, java.lang.Float] = + mutable.Map(getLoraAdapters.map { case (key, value) => + (key, float2Float(value)) + }.toSeq: _*) + modelParameters.setLoraAdapters(loraAdaptersMap.asJava) + } // Need to convert to mutable map first + + modelParameters + } + + // ---------------- GPU SUPPORT ---------------- + // Values for automatic GPU support + protected val defaultGpuLayers = 1000 + protected val defaultMainGpu = 0 + + // Entrypoint for models. Automatically set GPU support if detected. + protected def setGpuSupportIfAvailable(spark: SparkSession): this.type = { + val usingGPUJar: Boolean = spark.sparkContext.listJars.exists(_.contains("spark-nlp-gpu")) + if (usingGPUJar) { + logger.info("Using GPU jar. Offloading all layers to GPU.") + setMainGpu(defaultMainGpu) + setNGpuLayers(defaultGpuLayers) + } + this + } +} diff --git a/src/main/scala/com/johnsnowlabs/nlp/HasLlamaCppProperties.scala b/src/main/scala/com/johnsnowlabs/nlp/HasLlamaCppProperties.scala deleted file mode 100644 index e6d832eef9a79f..00000000000000 --- a/src/main/scala/com/johnsnowlabs/nlp/HasLlamaCppProperties.scala +++ /dev/null @@ -1,1292 +0,0 @@ -package com.johnsnowlabs.nlp - -import com.johnsnowlabs.nlp.annotators.seq2seq.AutoGGUFModel -import com.johnsnowlabs.nlp.llama.args._ -import com.johnsnowlabs.nlp.llama.{InferenceParameters, ModelParameters} -import com.johnsnowlabs.nlp.serialization.StructFeature -import org.apache.spark.ml.param._ -import org.slf4j.LoggerFactory - -import scala.collection.mutable -import scala.jdk.CollectionConverters._ - -/** Contains settable parameters for the [[AutoGGUFModel]]. 
- * - * @groupname param Parameters - * @groupname setParam Parameter setters - * @groupname getParam Parameter getters - * @groupprio setParam 1 - * @groupprio getParam 2 - * @groupprio param 3 - * @groupdesc param - * A list of (hyper-)parameter keys this annotator can take. Users can set and get the - * parameter values through setters and getters, respectively. - */ -trait HasLlamaCppProperties { - this: ParamsAndFeaturesWritable with HasProtectedParams => - val logger = LoggerFactory.getLogger(this.getClass) - // ---------------- MODEL PARAMETERS ---------------- - /** @group param */ - val nThreads = - new IntParam(this, "nThreads", "Set the number of threads to use during generation") - - /** @group param */ - val nThreadsDraft = new IntParam( - this, - "nThreadsDraft", - "Set the number of threads to use during draft generation") - - /** @group param */ - val nThreadsBatch = new IntParam( - this, - "nThreadsBatch", - "Set the number of threads to use during batch and prompt processing") - - /** @group param */ - val nThreadsBatchDraft = new IntParam( - this, - "nThreadsBatchDraft", - "Set the number of threads to use during batch and prompt processing") - - /** @group param */ - val nCtx = new IntParam(this, "nCtx", "Set the size of the prompt context") - - /** @group param */ - val nBatch = new IntParam( - this, - "nBatch", - "Set the logical batch size for prompt processing (must be >=32 to use BLAS)") - - /** @group param */ - val nUbatch = new IntParam( - this, - "nUbatch", - "Set the physical batch size for prompt processing (must be >=32 to use BLAS)") - - /** @group param */ - val nDraft = - new IntParam(this, "nDraft", "Set the number of tokens to draft for speculative decoding") - - /** @group param */ - val nChunks = new IntParam(this, "nChunks", "Set the maximal number of chunks to process") - - /** @group param */ - val nSequences = - new IntParam(this, "nSequences", "Set the number of sequences to decode") - - /** @group param */ - val pSplit = new FloatParam(this, "pSplit", "Set the speculative decoding split probability") - - /** @group param */ - val nGpuLayers = new IntParam( - this, - "nGpuLayers", - "Set the number of layers to store in VRAM (-1 - use default)") - - /** @group param */ - val nGpuLayersDraft = new IntParam( - this, - "nGpuLayersDraft", - "Set the number of layers to store in VRAM for the draft model (-1 - use default)") - - /** Set how to split the model across GPUs - * - * - NONE: No GPU split - * - LAYER: Split the model across GPUs by layer - * - ROW: Split the model across GPUs by rows - * - * @group param - */ - val gpuSplitMode = - new Param[String](this, "gpuSplitMode", "Set how to split the model across GPUs") - - /** @group param */ - val mainGpu = - new IntParam(this, "mainGpu", "Set the main GPU that is used for scratch and small tensors.") - - /** @group param */ - val tensorSplit = new DoubleArrayParam( - this, - "tensorSplit", - "Set how split tensors should be distributed across GPUs") - - /** @group param */ - val grpAttnN = new IntParam(this, "grpAttnN", "Set the group-attention factor") - - /** @group param */ - val grpAttnW = new IntParam(this, "grpAttnW", "Set the group-attention width") - - /** @group param */ - val ropeFreqBase = - new FloatParam(this, "ropeFreqBase", "Set the RoPE base frequency, used by NTK-aware scaling") - - /** @group param */ - val ropeFreqScale = new FloatParam( - this, - "ropeFreqScale", - "Set the RoPE frequency scaling factor, expands context by a factor of 1/N") - - /** @group param */ - val 
yarnExtFactor = - new FloatParam(this, "yarnExtFactor", "Set the YaRN extrapolation mix factor") - - /** @group param */ - val yarnAttnFactor = - new FloatParam(this, "yarnAttnFactor", "Set the YaRN scale sqrt(t) or attention magnitude") - - /** @group param */ - val yarnBetaFast = - new FloatParam(this, "yarnBetaFast", "Set the YaRN low correction dim or beta") - - /** @group param */ - val yarnBetaSlow = - new FloatParam(this, "yarnBetaSlow", "Set the YaRN high correction dim or alpha") - - /** @group param */ - val yarnOrigCtx = - new IntParam(this, "yarnOrigCtx", "Set the YaRN original context size of model") - - /** @group param */ - val defragmentationThreshold = - new FloatParam(this, "defragmentationThreshold", "Set the KV cache defragmentation threshold") - - /** Set optimization strategies that help on some NUMA systems (if available) - * - * Available Strategies: - * - * - DISABLED: No NUMA optimizations - * - DISTRIBUTE: Spread execution evenly over all - * - ISOLATE: Only spawn threads on CPUs on the node that execution started on - * - NUMA_CTL: Use the CPU map provided by numactl - * - MIRROR: Mirrors the model across NUMA nodes - * - * @group param - */ - val numaStrategy = new Param[String]( - this, - "numaStrategy", - "Set optimization strategies that help on some NUMA systems (if available)") - - /** Set the RoPE frequency scaling method, defaults to linear unless specified by the model. - * - * - UNSPECIFIED: Don't use any scaling - * - LINEAR: Linear scaling - * - YARN: YaRN RoPE scaling - * @group param - */ - val ropeScalingType = new Param[String]( - this, - "ropeScalingType", - "Set the RoPE frequency scaling method, defaults to linear unless specified by the model") - - /** Set the pooling type for embeddings, use model default if unspecified - * - * - 0 UNSPECIFIED: Don't use any pooling - * - 1 MEAN: Mean Pooling - * - 2 CLS: CLS Pooling - * - * @group param - */ - val poolingType = new Param[String]( - this, - "poolingType", - "Set the pooling type for embeddings, use model default if unspecified") - // model = new Param[String](this, "model", "Set the model file path to load") - /** @group param */ - val modelDraft = - new Param[String](this, "modelDraft", "Set the draft model for speculative decoding") - - // modelAlias = new Param[String](this, "modelAlias", "Set a model alias") - /** @group param */ - val lookupCacheStaticFilePath = new Param[String]( - this, - "lookupCacheStaticFilePath", - "Set path to static lookup cache to use for lookup decoding (not updated by generation)") - - /** @group param */ - val lookupCacheDynamicFilePath = new Param[String]( - this, - "lookupCacheDynamicFilePath", - "Set path to dynamic lookup cache to use for lookup decoding (updated by generation)") - - /** @group param */ - val loraAdapters = new StructFeature[Map[String, Float]](this, "loraAdapters") - - val embedding = - new BooleanParam(this, "embedding", "Whether to load model with embedding support") - - /** @group param */ - val flashAttention = - new BooleanParam(this, "flashAttention", "Whether to enable Flash Attention") - - /** @group param */ - val inputPrefixBos = new BooleanParam( - this, - "inputPrefixBos", - "Whether to add prefix BOS to user inputs, preceding the `--in-prefix` string") - - /** @group param */ - val useMmap = new BooleanParam( - this, - "useMmap", - "Whether to use memory-map model (faster load but may increase pageouts if not using mlock)") - - /** @group param */ - val useMlock = new BooleanParam( - this, - "useMlock", - "Whether to force 
the system to keep model in RAM rather than swapping or compressing") - - /** @group param */ - val noKvOffload = new BooleanParam(this, "noKvOffload", "Whether to disable KV offload") - - /** @group param */ - val systemPrompt = new Param[String](this, "systemPrompt", "Set a system prompt to use") - - /** @group param */ - val chatTemplate = - new Param[String](this, "chatTemplate", "The chat template to use") - - /** Set the number of threads to use during generation - * - * @group setParam - */ - def setNThreads(nThreads: Int): this.type = { set(this.nThreads, nThreads) } - - /** Set the number of threads to use during draft generation - * - * @group setParam - */ - def setNThreadsDraft(nThreadsDraft: Int): this.type = { set(this.nThreadsDraft, nThreadsDraft) } - - /** Set the number of threads to use during batch and prompt processing - * - * @group setParam - */ - def setNThreadsBatch(nThreadsBatch: Int): this.type = { set(this.nThreadsBatch, nThreadsBatch) } - - /** Set the number of threads to use during batch and prompt processing - * - * @group setParam - */ - def setNThreadsBatchDraft(nThreadsBatchDraft: Int): this.type = { - set(this.nThreadsBatchDraft, nThreadsBatchDraft) - } - - /** Set the size of the prompt context - * - * @group setParam - */ - def setNCtx(nCtx: Int): this.type = { set(this.nCtx, nCtx) } - - /** Set the logical batch size for prompt processing (must be >=32 to use BLAS) - * - * @group setParam - */ - def setNBatch(nBatch: Int): this.type = { set(this.nBatch, nBatch) } - - /** Set the physical batch size for prompt processing (must be >=32 to use BLAS) - * - * @group setParam - */ - def setNUbatch(nUbatch: Int): this.type = { set(this.nUbatch, nUbatch) } - - /** Set the number of tokens to draft for speculative decoding - * - * @group setParam - */ - def setNDraft(nDraft: Int): this.type = { set(this.nDraft, nDraft) } - - /** Set the maximal number of chunks to process - * - * @group setParam - */ - def setNChunks(nChunks: Int): this.type = { set(this.nChunks, nChunks) } - - /** Set the number of sequences to decode - * - * @group setParam - */ - def setNSequences(nSequences: Int): this.type = { set(this.nSequences, nSequences) } - - /** Set the speculative decoding split probability - * - * @group setParam - */ - def setPSplit(pSplit: Float): this.type = { set(this.pSplit, pSplit) } - - /** Set the number of layers to store in VRAM (-1 - use default) - * - * @group setParam - */ - def setNGpuLayers(nGpuLayers: Int): this.type = { set(this.nGpuLayers, nGpuLayers) } - - /** Set the number of layers to store in VRAM for the draft model (-1 - use default) - * - * @group setParam - */ - def setNGpuLayersDraft(nGpuLayersDraft: Int): this.type = { - set(this.nGpuLayersDraft, nGpuLayersDraft) - } - - /** Set how to split the model across GPUs - * - * - NONE: No GPU split - * -LAYER: Split the model across GPUs by layer 2. 
ROW: Split the model across GPUs by rows - * - * @group setParam - */ - def setGpuSplitMode(splitMode: String): this.type = { set(this.gpuSplitMode, splitMode) } - - /** Set the GPU that is used for scratch and small tensors - * - * @group setParam - */ - def setMainGpu(mainGpu: Int): this.type = { set(this.mainGpu, mainGpu) } - - /** Set how split tensors should be distributed across GPUs - * - * @group setParam - */ - def setTensorSplit(tensorSplit: Array[Double]): this.type = { - set(this.tensorSplit, tensorSplit) - } - - /** Set the group-attention factor - * - * @group setParam - */ - def setGrpAttnN(grpAttnN: Int): this.type = { set(this.grpAttnN, grpAttnN) } - - /** Set the group-attention width - * - * @group setParam - */ - def setGrpAttnW(grpAttnW: Int): this.type = { set(this.grpAttnW, grpAttnW) } - - /** Set the RoPE base frequency, used by NTK-aware scaling - * - * @group setParam - */ - def setRopeFreqBase(ropeFreqBase: Float): this.type = { set(this.ropeFreqBase, ropeFreqBase) } - - /** Set the RoPE frequency scaling factor, expands context by a factor of 1/N - * - * @group setParam - */ - def setRopeFreqScale(ropeFreqScale: Float): this.type = { - set(this.ropeFreqScale, ropeFreqScale) - } - - /** Set the YaRN extrapolation mix factor - * - * @group setParam - */ - def setYarnExtFactor(yarnExtFactor: Float): this.type = { - set(this.yarnExtFactor, yarnExtFactor) - } - - /** Set the YaRN scale sqrt(t) or attention magnitude - * - * @group setParam - */ - def setYarnAttnFactor(yarnAttnFactor: Float): this.type = { - set(this.yarnAttnFactor, yarnAttnFactor) - } - - /** Set the YaRN low correction dim or beta - * - * @group setParam - */ - def setYarnBetaFast(yarnBetaFast: Float): this.type = { set(this.yarnBetaFast, yarnBetaFast) } - - /** Set the YaRN high correction dim or alpha - * - * @group setParam - */ - def setYarnBetaSlow(yarnBetaSlow: Float): this.type = { set(this.yarnBetaSlow, yarnBetaSlow) } - - /** Set the YaRN original context size of model - * - * @group setParam - */ - def setYarnOrigCtx(yarnOrigCtx: Int): this.type = { set(this.yarnOrigCtx, yarnOrigCtx) } - - /** Set the KV cache defragmentation threshold - * - * @group setParam - */ - def setDefragmentationThreshold(defragThold: Float): this.type = { - set(this.defragmentationThreshold, defragThold) - } - - /** Set optimization strategies that help on some NUMA systems (if available) - * - * Available Strategies: - * - * - DISABLED: No NUMA optimizations - * - DISTRIBUTE: spread execution evenly over all - * - ISOLATE: only spawn threads on CPUs on the node that execution started on - * - NUMA_CTL: use the CPU map provided by numactl - * - MIRROR: Mirrors the model across NUMA nodes - * - * @group setParam - */ - def setNumaStrategy(numa: String): this.type = { set(this.numaStrategy, numa) } - - /** Set the RoPE frequency scaling method, defaults to linear unless specified by the model. 
- * - * - UNSPECIFIED: Don't use any scaling - * - LINEAR: Linear scaling - * - YARN: YaRN RoPE scaling - * @group setParam - */ - def setRopeScalingType(ropeScalingType: String): this.type = { - set(this.ropeScalingType, ropeScalingType) - } - - /** Set the pooling type for embeddings, use model default if unspecified - * - * - UNSPECIFIED: Don't use any pooling - * - MEAN: Mean Pooling - * - CLS: CLS Pooling - * - * @group setParam - */ - def setPoolingType(poolingType: String): this.type = { set(this.poolingType, poolingType) } - - /** Set the draft model for speculative decoding - * - * @group setParam - */ - def setModelDraft(modelDraft: String): this.type = { set(this.modelDraft, modelDraft) } - - /** Set a model alias - * - * @group setParam - */ - def setLookupCacheStaticFilePath(lookupCacheStaticFilePath: String): this.type = { - set(this.lookupCacheStaticFilePath, lookupCacheStaticFilePath) - } - - /** Set a model alias - * - * @group setParam - */ - def setLookupCacheDynamicFilePath(lookupCacheDynamicFilePath: String): this.type = { - set(this.lookupCacheDynamicFilePath, lookupCacheDynamicFilePath) - } - - /** Sets paths to lora adapters with user defined scale. - * - * @group setParam - */ - def setLoraAdapters(loraAdapters: Map[String, Float]): this.type = { - set(this.loraAdapters, loraAdapters) - } - - /** Sets paths to lora adapters with user defined scale. (PySpark Override) - * - * @group setParam - */ - def setLoraAdapters(loraAdapters: java.util.HashMap[String, java.lang.Double]): this.type = { - val scalaLoraAdapters = loraAdapters.asScala.map { case (k, v) => k -> v.floatValue() } - set(this.loraAdapters, scalaLoraAdapters.toMap) - } - - /** Whether to load model with embedding support - * - * @group setParam - */ - def setEmbedding(embedding: Boolean): this.type = { set(this.embedding, embedding) } - - /** Whether to enable Flash Attention - * - * @group setParam - */ - def setFlashAttention(flashAttention: Boolean): this.type = { - set(this.flashAttention, flashAttention) - } - - /** Whether to add prefix BOS to user inputs, preceding the `--in-prefix` string - * - * @group setParam - */ - def setInputPrefixBos(inputPrefixBos: Boolean): this.type = { - set(this.inputPrefixBos, inputPrefixBos) - } - - /** Whether to use memory-map model (faster load but may increase pageouts if not using mlock) - * - * @group setParam - */ - def setUseMmap(useMmap: Boolean): this.type = { set(this.useMmap, useMmap) } - - /** Whether to force the system to keep model in RAM rather than swapping or compressing - * - * @group setParam - */ - def setUseMlock(useMlock: Boolean): this.type = { set(this.useMlock, useMlock) } - - /** Whether to disable KV offload - * - * @group setParam - */ - def setNoKvOffload(noKvOffload: Boolean): this.type = { set(this.noKvOffload, noKvOffload) } - - /** Set a system prompt to use - * - * @group setParam - */ - def setSystemPrompt(systemPrompt: String): this.type = { set(this.systemPrompt, systemPrompt) } - - /** The chat template to use - * - * @group setParam - */ - def setChatTemplate(chatTemplate: String): this.type = { set(this.chatTemplate, chatTemplate) } - - // ---------------- GETTERS ---------------- - /** @group getParam */ - def getNThreads: Int = $(nThreads) - - /** @group getParam */ - def getNThreadsDraft: Int = $(nThreadsDraft) - - /** @group getParam */ - def getNThreadsBatch: Int = $(nThreadsBatch) - - /** @group getParam */ - def getNThreadsBatchDraft: Int = $(nThreadsBatchDraft) - - /** @group getParam */ - def getNCtx: Int = $(nCtx) 
- - /** @group getParam */ - def getNBatch: Int = $(nBatch) - - /** @group getParam */ - def getNUbatch: Int = $(nUbatch) - - /** @group getParam */ - def getNDraft: Int = $(nDraft) - - /** @group getParam */ - def getNChunks: Int = $(nChunks) - - /** @group getParam */ - def getNSequences: Int = $(nSequences) - - /** @group getParam */ - def getPSplit: Float = $(pSplit) - - /** @group getParam */ - def getNGpuLayers: Int = $(nGpuLayers) - - /** @group getParam */ - def getNGpuLayersDraft: Int = $(nGpuLayersDraft) - - /** @group getParam */ - def getSplitMode: String = $(gpuSplitMode) - - /** @group getParam */ - def getMainGpu: Int = $(mainGpu) - - /** @group getParam */ - def getTensorSplit: Array[Double] = $(tensorSplit) - - def getGrpAttnN: Int = $(grpAttnN) - - /** @group getParam */ - def getGrpAttnW: Int = $(grpAttnW) - - /** @group getParam */ - def getRopeFreqBase: Float = $(ropeFreqBase) - - /** @group getParam */ - def getRopeFreqScale: Float = $(ropeFreqScale) - - /** @group getParam */ - def getYarnExtFactor: Float = $(yarnExtFactor) - - /** @group getParam */ - def getYarnAttnFactor: Float = $(yarnAttnFactor) - - /** @group getParam */ - def getYarnBetaFast: Float = $(yarnBetaFast) - - /** @group getParam */ - def getYarnBetaSlow: Float = $(yarnBetaSlow) - - /** @group getParam */ - def getYarnOrigCtx: Int = $(yarnOrigCtx) - - /** @group getParam */ - def getDefragmentationThreshold: Float = $(defragmentationThreshold) - - /** @group getParam */ - def getNuma: String = $(numaStrategy) - - /** @group getParam */ - def getRopeScalingType: String = $(ropeScalingType) - - /** @group getParam */ - def getPoolingType: String = $(poolingType) - - /** @group getParam */ - def getModelDraft: String = $(modelDraft) - - /** @group getParam */ - def getLookupCacheStaticFilePath: String = $(lookupCacheStaticFilePath) - - /** @group getParam */ - def getLookupCacheDynamicFilePath: String = $(lookupCacheDynamicFilePath) - - /** @group getParam */ - def getLoraAdapters: Map[String, Float] = $$(loraAdapters) - - /** @group getParam */ - def getEmbedding: Boolean = $(embedding) - - /** @group getParam */ - def getFlashAttention: Boolean = $(flashAttention) - - /** @group getParam */ - def getInputPrefixBos: Boolean = $(inputPrefixBos) - - /** @group getParam */ - def getUseMmap: Boolean = $(useMmap) - - /** @group getParam */ - def getUseMlock: Boolean = $(useMlock) - - /** @group getParam */ - def getNoKvOffload: Boolean = $(noKvOffload) - - /** @group getParam */ - def getSystemPrompt: String = $(systemPrompt) - - /** @group getParam */ - def getChatTemplate: String = $(chatTemplate) - - // ---------------- INFERENCE PARAMETERS ---------------- - /** @group param */ - val inputPrefix = - new Param[String](this, "inputPrefix", "Set the prompt to start generation with") - - /** @group param */ - val inputSuffix = - new Param[String](this, "inputSuffix", "Set a suffix for infilling") - - /** @group param */ - val cachePrompt = new BooleanParam( - this, - "cachePrompt", - "Whether to remember the prompt to avoid reprocessing it") - - /** @group param */ - val nPredict = new IntParam(this, "nPredict", "Set the number of tokens to predict") - - /** @group param */ - val topK = new IntParam(this, "topK", "Set top-k sampling") - - /** @group param */ - val topP = new FloatParam(this, "topP", "Set top-p sampling") - - /** @group param */ - val minP = new FloatParam(this, "minP", "Set min-p sampling") - - /** @group param */ - val tfsZ = new FloatParam(this, "tfsZ", "Set tail free sampling, parameter 
z") - - /** @group param */ - val typicalP = new FloatParam(this, "typicalP", "Set locally typical sampling, parameter p") - - /** @group param */ - val temperature = new FloatParam(this, "temperature", "Set the temperature") - - /** @group param */ - val dynamicTemperatureRange = - new FloatParam(this, "dynatempRange", "Set the dynamic temperature range") - - /** @group param */ - val dynamicTemperatureExponent = - new FloatParam(this, "dynatempExponent", "Set the dynamic temperature exponent") - - /** @group param */ - val repeatLastN = - new IntParam(this, "repeatLastN", "Set the last n tokens to consider for penalties") - - /** @group param */ - val repeatPenalty = - new FloatParam(this, "repeatPenalty", "Set the penalty of repeated sequences of tokens") - - /** @group param */ - val frequencyPenalty = - new FloatParam(this, "frequencyPenalty", "Set the repetition alpha frequency penalty") - - /** @group param */ - val presencePenalty = - new FloatParam(this, "presencePenalty", "Set the repetition alpha presence penalty") - - /** @group param */ - val miroStat = new Param[String](this, "miroStat", "Set MiroStat sampling strategies.") - - /** @group param */ - val miroStatTau = - new FloatParam(this, "mirostatTau", "Set the MiroStat target entropy, parameter tau") - - /** @group param */ - val miroStatEta = - new FloatParam(this, "mirostatEta", "Set the MiroStat learning rate, parameter eta") - - /** @group param */ - val penalizeNl = new BooleanParam(this, "penalizeNl", "Whether to penalize newline tokens") - - /** @group param */ - val nKeep = - new IntParam(this, "nKeep", "Set the number of tokens to keep from the initial prompt") - - /** @group param */ - val seed = new IntParam(this, "seed", "Set the RNG seed") - - /** @group param */ - val nProbs = new IntParam( - this, - "nProbs", - "Set the amount top tokens probabilities to output if greater than 0.") - - /** @group param */ - val minKeep = new IntParam( - this, - "minKeep", - "Set the amount of tokens the samplers should return at least (0 = disabled)") - - /** @group param */ - val grammar = - new Param[String](this, "grammar", "Set BNF-like grammar to constrain generations") - - /** @group param */ - val penaltyPrompt = new Param[String]( - this, - "penaltyPrompt", - "Override which part of the prompt is penalized for repetition.") - - /** @group param */ - val ignoreEos = new BooleanParam( - this, - "ignoreEos", - "Set whether to ignore end of stream token and continue generating (implies --logit-bias 2-inf)") - - // Modify the likelihood of tokens appearing in the completion by their id. - val tokenIdBias: StructFeature[Map[Int, Float]] = - new StructFeature[Map[Int, Float]](this, "tokenIdBias") - - // Modify the likelihood of tokens appearing in the completion by their string. 
- /** @group param */ - val tokenBias: StructFeature[Map[String, Float]] = - new StructFeature[Map[String, Float]](this, "tokenBias") - - /** @group param */ - val disableTokenIds = - new IntArrayParam(this, "disableTokenIds", "Set the token ids to disable in the completion") - - /** @group param */ - val stopStrings = new StringArrayParam( - this, - "stopStrings", - "Set strings upon seeing which token generation is stopped") - - /** @group param */ - val samplers = new StringArrayParam( - this, - "samplers", - "Set which samplers to use for token generation in the given order") - - /** @group param */ - val useChatTemplate = new BooleanParam( - this, - "useChatTemplate", - "Set whether or not generate should apply a chat template") - - /** Set the prompt to start generation with - * - * @group setParam - */ - def setInputPrefix(inputPrefix: String): this.type = { set(this.inputPrefix, inputPrefix) } - - /** Set a suffix for infilling - * - * @group setParam - */ - def setInputSuffix(inputSuffix: String): this.type = { set(this.inputSuffix, inputSuffix) } - - /** Whether to remember the prompt to avoid reprocessing it - * - * @group setParam - */ - def setCachePrompt(cachePrompt: Boolean): this.type = { set(this.cachePrompt, cachePrompt) } - - /** Set the number of tokens to predict - * - * @group setParam - */ - def setNPredict(nPredict: Int): this.type = { set(this.nPredict, nPredict) } - - /** Set top-k sampling - * - * @group setParam - */ - def setTopK(topK: Int): this.type = { set(this.topK, topK) } - - /** Set top-p sampling - * - * @group setParam - */ - def setTopP(topP: Float): this.type = { set(this.topP, topP) } - - /** Set min-p sampling - * - * @group setParam - */ - def setMinP(minP: Float): this.type = { set(this.minP, minP) } - - /** Set tail free sampling, parameter z - * @group setParam - */ - def setTfsZ(tfsZ: Float): this.type = { set(this.tfsZ, tfsZ) } - - /** Set locally typical sampling, parameter p - * - * @group setParam - */ - def setTypicalP(typicalP: Float): this.type = { set(this.typicalP, typicalP) } - - /** Set the temperature - * - * @group setParam - */ - def setTemperature(temperature: Float): this.type = { set(this.temperature, temperature) } - - /** Set the dynamic temperature range - * - * @group setParam - */ - def setDynamicTemperatureRange(dynatempRange: Float): this.type = { - set(this.dynamicTemperatureRange, dynatempRange) - } - - /** Set the dynamic temperature exponent - * - * @group setParam - */ - def setDynamicTemperatureExponent(dynatempExponent: Float): this.type = { - set(this.dynamicTemperatureExponent, dynatempExponent) - } - - /** Set the last n tokens to consider for penalties - * - * @group setParam - */ - def setRepeatLastN(repeatLastN: Int): this.type = { set(this.repeatLastN, repeatLastN) } - - /** Set the penalty of repeated sequences of tokens - * - * @group setParam - */ - def setRepeatPenalty(repeatPenalty: Float): this.type = { - set(this.repeatPenalty, repeatPenalty) - } - - /** Set the repetition alpha frequency penalty - * - * @group setParam - */ - def setFrequencyPenalty(frequencyPenalty: Float): this.type = { - set(this.frequencyPenalty, frequencyPenalty) - } - - /** Set the repetition alpha presence penalty - * - * @group setParam - */ - def setPresencePenalty(presencePenalty: Float): this.type = { - set(this.presencePenalty, presencePenalty) - } - - /** Set MiroStat sampling strategies. 
- * - * - DISABLED: No MiroStat - * - V1: MiroStat V1 - * - V2: MiroStat V2 - * - * @group setParam - */ - def setMiroStat(mirostat: String): this.type = set(this.miroStat, mirostat) - - /** Set the MiroStat target entropy, parameter tau - * - * @group setParam - */ - def setMiroStatTau(mirostatTau: Float): this.type = { set(this.miroStatTau, mirostatTau) } - - /** Set the MiroStat learning rate, parameter eta - * - * @group setParam - */ - def setMiroStatEta(mirostatEta: Float): this.type = { set(this.miroStatEta, mirostatEta) } - - /** Set whether to penalize newline tokens - * - * @group setParam - */ - def setPenalizeNl(penalizeNl: Boolean): this.type = { set(this.penalizeNl, penalizeNl) } - - /** Set the number of tokens to keep from the initial prompt - * - * @group setParam - */ - def setNKeep(nKeep: Int): this.type = { set(this.nKeep, nKeep) } - - /** Set the RNG seed - * - * @group setParam - */ - def setSeed(seed: Int): this.type = { set(this.seed, seed) } - - /** Set the amount top tokens probabilities to output if greater than 0. - * - * @group setParam - */ - def setNProbs(nProbs: Int): this.type = { set(this.nProbs, nProbs) } - - /** Set the amount of tokens the samplers should return at least (0 = disabled) - * - * @group setParam - */ - def setMinKeep(minKeep: Int): this.type = { set(this.minKeep, minKeep) } - - /** Set BNF-like grammar to constrain generations - * - * @group setParam - */ - def setGrammar(grammar: String): this.type = { set(this.grammar, grammar) } - - /** Override which part of the prompt is penalized for repetition. - * - * @group setParam - */ - def setPenaltyPrompt(penaltyPrompt: String): this.type = { - set(this.penaltyPrompt, penaltyPrompt) - } - - /** Set whether to ignore end of stream token and continue generating (implies --logit-bias - * 2-inf) - * - * @group setParam - */ - def setIgnoreEos(ignoreEos: Boolean): this.type = { set(this.ignoreEos, ignoreEos) } - - /** Set the tokens to disable during completion. - * - * @group setParam - */ - def setTokenBias(tokenBias: Map[String, Float]): this.type = { - set(this.tokenBias, tokenBias) - } - - /** Set the tokens to disable during completion. (Override for PySpark) - * - * @group setParam - */ - def setTokenBias(tokenBias: java.util.HashMap[String, java.lang.Double]): this.type = { - val scalaTokenBias = tokenBias.asScala.map { case (k, v) => k -> v.floatValue() } - set(this.tokenBias, scalaTokenBias.toMap) - } - - /** Set the token ids to disable in the completion. - * - * @group setParam - */ - def setTokenIdBias(tokenIdBias: Map[Int, Float]): this.type = { - set(this.tokenIdBias, tokenIdBias) - } - - /** Set the token ids to disable in the completion. (Override for PySpark) - * - * @group setParam - */ - def setTokenIdBias(tokenIdBias: java.util.HashMap[Integer, java.lang.Double]): this.type = { - val scalaTokenIdBias = tokenIdBias.asScala.map { case (k, v) => k.toInt -> v.toFloat } - set(this.tokenIdBias, scalaTokenIdBias.toMap) - } - - /** Set the token ids to disable in the completion. This corresponds to `setTokenBias` with a - * value of `Float.NEGATIVE_INFINITY`. - * - * @group setParam - */ - def setDisableTokenIds(disableTokenIds: Array[Int]): this.type = { - set(this.disableTokenIds, disableTokenIds) - } - - /** Set strings upon seeing which token generation is stopped - * - * @group setParam - */ - def setStopStrings(stopStrings: Array[String]): this.type = { - set(this.stopStrings, stopStrings) - } - - /** Set which samplers to use for token generation in the given order . 
- * - * Available Samplers are: - * - * - TOP_K: Top-k sampling - * - TFS_Z: Tail free sampling - * - TYPICAL_P: Locally typical sampling p - * - TOP_P: Top-p sampling - * - MIN_P: Min-p sampling - * - TEMPERATURE: Temperature sampling - * @group setParam - */ - def setSamplers(samplers: Array[String]): this.type = { set(this.samplers, samplers) } - - /** Set whether or not generate should apply a chat template - * - * @group setParam - */ - def setUseChatTemplate(useChatTemplate: Boolean): this.type = { - set(this.useChatTemplate, useChatTemplate) - } - - // ---------------- GETTERS ---------------- - /** @group getParam */ - def getInputPrefix: String = $(inputPrefix) - - /** @group getParam */ - def getInputSuffix: String = $(inputSuffix) - - /** @group getParam */ - def getCachePrompt: Boolean = $(cachePrompt) - - def getNPredict: Int = $(nPredict) - - /** @group getParam */ - def getTopK: Int = $(topK) - - /** @group getParam */ - def getTopP: Float = $(topP) - - /** @group getParam */ - def getMinP: Float = $(minP) - - /** @group getParam */ - def getTfsZ: Float = $(tfsZ) - - /** @group getParam */ - def getTypicalP: Float = $(typicalP) - - /** @group getParam */ - def getTemperature: Float = $(temperature) - - /** @group getParam */ - def getDynamicTemperatureRange: Float = $(dynamicTemperatureRange) - - /** @group getParam */ - def getDynamicTemperatureExponent: Float = $(dynamicTemperatureExponent) - - /** @group getParam */ - def getRepeatLastN: Int = $(repeatLastN) - - /** @group getParam */ - def getRepeatPenalty: Float = $(repeatPenalty) - - /** @group getParam */ - def getFrequencyPenalty: Float = $(frequencyPenalty) - - /** @group getParam */ - def getPresencePenalty: Float = $(presencePenalty) - - /** @group getParam */ - def getMiroStat: String = $(miroStat) - - /** @group getParam */ - def getMiroStatTau: Float = $(miroStatTau) - - /** @group getParam */ - def getMiroStatEta: Float = $(miroStatEta) - - /** @group getParam */ - def getPenalizeNl: Boolean = $(penalizeNl) - - /** @group getParam */ - def getNKeep: Int = $(nKeep) - - /** @group getParam */ - def getSeed: Int = $(seed) - - /** @group getParam */ - def getNProbs: Int = $(nProbs) - - /** @group getParam */ - def getMinKeep: Int = $(minKeep) - - /** @group getParam */ - def getGrammar: String = $(grammar) - - /** @group getParam */ - def getPenaltyPrompt: String = $(penaltyPrompt) - - /** @group getParam */ - def getIgnoreEos: Boolean = $(ignoreEos) - - /** @group getParam */ - def getTokenIdBias: Map[Int, Float] = $$(tokenIdBias) - - /** @group getParam */ - def getTokenBias: Map[String, Float] = $$(tokenBias) - - /** @group getParam */ - def getDisableTokenIds: Array[Int] = $(disableTokenIds) - - /** @group getParam */ - def getStopStrings: Array[String] = $(stopStrings) - - /** @group getParam */ - def getSamplers: Array[String] = $(samplers) - - /** @group getParam */ - def getUseChatTemplate: Boolean = $(useChatTemplate) - - protected def getModelParameters: ModelParameters = { - val modelParameters = new ModelParameters().setContinuousBatching(true) // Always enabled - - if (isDefined(chatTemplate)) modelParameters.setChatTemplate($(chatTemplate)) - if (isDefined(defragmentationThreshold)) - modelParameters.setDefragmentationThreshold($(defragmentationThreshold)) - if (isDefined(embedding)) modelParameters.setEmbedding($(embedding)) - if (isDefined(flashAttention)) modelParameters.setFlashAttention($(flashAttention)) - if (isDefined(gpuSplitMode)) - 
modelParameters.setSplitMode(GpuSplitMode.valueOf($(gpuSplitMode))) - if (isDefined(grpAttnN)) modelParameters.setGrpAttnN($(grpAttnN)) - if (isDefined(grpAttnW)) modelParameters.setGrpAttnN($(grpAttnW)) - if (isDefined(inputPrefixBos)) modelParameters.setInputPrefixBos($(inputPrefixBos)) - if (isDefined(lookupCacheDynamicFilePath)) - modelParameters.setLookupCacheDynamicFilePath($(lookupCacheDynamicFilePath)) - if (isDefined(lookupCacheStaticFilePath)) - modelParameters.setLookupCacheStaticFilePath($(lookupCacheStaticFilePath)) - if (isDefined(mainGpu)) modelParameters.setMainGpu($(mainGpu)) - if (isDefined(modelDraft)) modelParameters.setModelDraft($(modelDraft)) - if (isDefined(nBatch)) modelParameters.setNBatch($(nBatch)) - if (isDefined(nChunks)) modelParameters.setNChunks($(nChunks)) - if (isDefined(nCtx)) modelParameters.setNCtx($(nCtx)) - if (isDefined(nDraft)) modelParameters.setNDraft($(nDraft)) - if (isDefined(nGpuLayers)) modelParameters.setNGpuLayers($(nGpuLayers)) - if (isDefined(nGpuLayersDraft)) modelParameters.setNGpuLayersDraft($(nGpuLayersDraft)) - if (isDefined(nSequences)) modelParameters.setNSequences($(nSequences)) - if (isDefined(nThreads)) modelParameters.setNThreads($(nThreads)) - if (isDefined(nThreadsBatch)) modelParameters.setNThreadsBatch($(nThreadsBatch)) - if (isDefined(nThreadsBatchDraft)) - modelParameters.setNThreadsBatchDraft($(nThreadsBatchDraft)) - if (isDefined(nThreadsDraft)) modelParameters.setNThreadsDraft($(nThreadsDraft)) - if (isDefined(nUbatch)) modelParameters.setNUbatch($(nUbatch)) - if (isDefined(noKvOffload)) modelParameters.setNoKvOffload($(noKvOffload)) - if (isDefined(numaStrategy)) modelParameters.setNuma(NumaStrategy.valueOf($(numaStrategy))) - if (isDefined(pSplit)) modelParameters.setPSplit($(pSplit)) - if (isDefined(poolingType)) - modelParameters.setPoolingType(PoolingType.valueOf($(poolingType))) - if (isDefined(ropeFreqBase)) modelParameters.setRopeFreqBase($(ropeFreqBase)) - if (isDefined(ropeFreqScale)) modelParameters.setRopeFreqScale($(ropeFreqScale)) - if (isDefined(ropeScalingType)) - modelParameters.setRopeScalingType(RopeScalingType.valueOf($(ropeScalingType))) - if (isDefined(systemPrompt)) modelParameters.setSystemPrompt($(systemPrompt)) - if (isDefined(tensorSplit)) modelParameters.setTensorSplit($(tensorSplit).map(_.toFloat)) - if (isDefined(useMlock)) modelParameters.setUseMlock($(useMlock)) - if (isDefined(useMmap)) modelParameters.setUseMmap($(useMmap)) - if (isDefined(yarnAttnFactor)) modelParameters.setYarnAttnFactor($(yarnAttnFactor)) - if (isDefined(yarnBetaFast)) modelParameters.setYarnBetaFast($(yarnBetaFast)) - if (isDefined(yarnBetaSlow)) modelParameters.setYarnBetaSlow($(yarnBetaSlow)) - if (isDefined(yarnExtFactor)) modelParameters.setYarnExtFactor($(yarnExtFactor)) - if (isDefined(yarnOrigCtx)) modelParameters.setYarnOrigCtx($(yarnOrigCtx)) - if (loraAdapters.isSet) { - val loraAdaptersMap: mutable.Map[String, java.lang.Float] = - mutable.Map($$(loraAdapters).map { case (key, value) => - (key, float2Float(value)) - }.toSeq: _*) - modelParameters.setLoraAdapters(loraAdaptersMap.asJava) - } // Need to convert to mutable map first - - modelParameters - } - - protected def getInferenceParameters: InferenceParameters = { - val inferenceParams = new InferenceParameters("") - if (isDefined(cachePrompt)) inferenceParams.setCachePrompt($(cachePrompt)) - if (isDefined(disableTokenIds)) { - val javaCollection: java.util.Collection[Integer] = - $(disableTokenIds).map(int2Integer).toSeq.asJava - 
inferenceParams.disableTokenIds(javaCollection) - } - if (isDefined(dynamicTemperatureExponent)) - inferenceParams.setDynamicTemperatureExponent($(dynamicTemperatureExponent)) - if (isDefined(dynamicTemperatureRange)) - inferenceParams.setDynamicTemperatureRange($(dynamicTemperatureRange)) - if (isDefined(frequencyPenalty)) inferenceParams.setFrequencyPenalty($(frequencyPenalty)) - if (isDefined(grammar)) inferenceParams.setGrammar($(grammar)) - if (isDefined(ignoreEos)) inferenceParams.setIgnoreEos($(ignoreEos)) - if (isDefined(inputPrefix)) inferenceParams.setInputPrefix($(inputPrefix)) - if (isDefined(inputSuffix)) inferenceParams.setInputSuffix($(inputSuffix)) - if (isDefined(minKeep)) inferenceParams.setMinKeep($(minKeep)) - if (isDefined(minP)) inferenceParams.setMinP($(minP)) - if (isDefined(miroStat)) inferenceParams.setMiroStat(MiroStat.valueOf($(miroStat))) - if (isDefined(miroStatEta)) inferenceParams.setMiroStatEta($(miroStatEta)) - if (isDefined(miroStatTau)) inferenceParams.setMiroStatTau($(miroStatTau)) - if (isDefined(nKeep)) inferenceParams.setNKeep($(nKeep)) - if (isDefined(nPredict)) inferenceParams.setNPredict($(nPredict)) - if (isDefined(nProbs)) inferenceParams.setNProbs($(nProbs)) - if (isDefined(penalizeNl)) inferenceParams.setPenalizeNl($(penalizeNl)) - if (isDefined(penaltyPrompt)) inferenceParams.setPenaltyPrompt($(penaltyPrompt)) - if (isDefined(presencePenalty)) inferenceParams.setPresencePenalty($(presencePenalty)) - if (isDefined(repeatLastN)) inferenceParams.setRepeatLastN($(repeatLastN)) - if (isDefined(repeatPenalty)) inferenceParams.setRepeatPenalty($(repeatPenalty)) - if (isDefined(samplers)) inferenceParams.setSamplers($(samplers).map(Sampler.valueOf): _*) - if (isDefined(seed)) inferenceParams.setSeed($(seed)) - if (isDefined(stopStrings)) inferenceParams.setStopStrings($(stopStrings): _*) - if (isDefined(temperature)) inferenceParams.setTemperature($(temperature)) - if (isDefined(tfsZ)) inferenceParams.setTfsZ($(tfsZ)) - if (isDefined(topK)) inferenceParams.setTopK($(topK)) - if (isDefined(topP)) inferenceParams.setTopP($(topP)) - if (isDefined(typicalP)) inferenceParams.setTypicalP($(typicalP)) - if (isDefined(useChatTemplate)) inferenceParams.setUseChatTemplate($(useChatTemplate)) - if (tokenBias.isSet) { - val tokenBiasMap: mutable.Map[String, java.lang.Float] = mutable.Map($$(tokenBias).map { - case (key, value) => (key, float2Float(value)) - }.toSeq: _*) - inferenceParams.setTokenBias(tokenBiasMap.asJava) - } - if (tokenIdBias.isSet) { - val tokenIdBiasMap: mutable.Map[Integer, java.lang.Float] = - mutable.Map($$(tokenIdBias).map { case (key, value) => - (int2Integer(key), float2Float(value)) - }.toSeq: _*) - inferenceParams.setTokenIdBias(tokenIdBiasMap.asJava) - } - - inferenceParams - } - - // ---------------- METADATA ---------------- - val metadata = - new Param[String](this, "metadata", "Set the metadata for the model").setProtected() - - /** Set the metadata for the model - * @group setParam - */ - def setMetadata(metadata: String): this.type = { set(this.metadata, metadata) } - - /** Get the metadata for the model - * @group getParam - */ - def getMetadata: String = $(metadata) -} diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotator.scala b/src/main/scala/com/johnsnowlabs/nlp/annotator.scala index 27daac826bb595..efbd3a288896c1 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotator.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotator.scala @@ -828,4 +828,9 @@ package object annotator { object SnowFlakeEmbeddings extends 
ReadablePretrainedSnowFlakeModel with ReadSnowFlakeDLModel + type AutoGGUFEmbeddings = com.johnsnowlabs.nlp.embeddings.AutoGGUFEmbeddings + object AutoGGUFEmbeddings + extends ReadablePretrainedAutoGGUFEmbeddings + with ReadAutoGGUFEmbeddings + } diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/AutoGGUFModel.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/AutoGGUFModel.scala index 405e48f6d1195b..3caf4bdc0e8be2 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/AutoGGUFModel.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/AutoGGUFModel.scala @@ -23,14 +23,12 @@ import com.johnsnowlabs.nlp.util.io.ResourceHelper import org.apache.spark.broadcast.Broadcast import org.apache.spark.ml.util.Identifiable import org.apache.spark.sql.SparkSession -import org.json4s.DefaultFormats -import org.json4s.jackson.JsonMethods /** Annotator that uses the llama.cpp library to generate text completions with large language * models. * - * For settable parameters, and their explanations, see [[HasLlamaCppProperties]] and refer to - * the llama.cpp documentation of + * For settable parameters, and their explanations, see [[HasLlamaCppInferenceProperties]], + * [[HasLlamaCppModelProperties]] and refer to the llama.cpp documentation of * [[https://github.com/ggerganov/llama.cpp/tree/7d5e8777ae1d21af99d4f95be10db4870720da91/examples/server server.cpp]] * for more information. * @@ -118,7 +116,8 @@ class AutoGGUFModel(override val uid: String) extends AnnotatorModel[AutoGGUFModel] with HasBatchedAnnotate[AutoGGUFModel] with HasEngine - with HasLlamaCppProperties + with HasLlamaCppModelProperties + with HasLlamaCppInferenceProperties with HasProtectedParams { override val outputAnnotatorType: AnnotatorType = AnnotatorType.DOCUMENT @@ -131,10 +130,6 @@ class AutoGGUFModel(override val uid: String) private var _model: Option[Broadcast[GGUFWrapper]] = None - // Values for automatic GPU support - private val defaultGpuLayers = 1000 - private val defaultMainGpu = 0 - /** @group getParam */ def getModelIfNotSet: GGUFWrapper = _model.get.value @@ -145,18 +140,18 @@ class AutoGGUFModel(override val uid: String) } // Entrypoint for models. Automatically set GPU support if detected. - val usingGPUJar: Boolean = spark.sparkContext.listJars.exists(_.contains("spark-nlp-gpu")) - if (usingGPUJar) { - logger.info("Using GPU jar. 
Offloading all layers to GPU.") - setMainGpu(defaultMainGpu) - setNGpuLayers(defaultGpuLayers) - } - this + setGpuSupportIfAvailable(spark) } private[johnsnowlabs] def setEngine(engineName: String): this.type = set(engine, engineName) - setDefault(engine -> LlamaCPP.name) + setDefault( + engine -> LlamaCPP.name, + useChatTemplate -> true, + nCtx -> 4096, + nBatch -> 512, + embedding -> false, + nPredict -> 100) override def onWrite(path: String, spark: SparkSession): Unit = { super.onWrite(path, spark) @@ -173,6 +168,7 @@ class AutoGGUFModel(override val uid: String) override def batchAnnotate(batchedAnnotations: Seq[Array[Annotation]]): Seq[Seq[Annotation]] = { val annotations: Seq[Annotation] = batchedAnnotations.flatten if (annotations.nonEmpty) { + val annotationsText = annotations.map(_.result) val modelParams = getModelParameters.setNParallel(getBatchSize) // set parallel decoding to batch size @@ -180,18 +176,36 @@ class AutoGGUFModel(override val uid: String) val model: LlamaModel = getModelIfNotSet.getSession(modelParams) - val annotationsText = annotations.map(_.result) - - val (completedTexts: Array[String], metadata: Map[String, String]) = - try { - (model.requestBatchCompletion(annotationsText.toArray, inferenceParams), Map.empty) - } catch { - case e: Exception => - logger.error("Error in llama.cpp batch completion", e) - (Array[String](), Map("exception" -> e.getMessage)) + if (getEmbedding) { + // Return embeddings in annotation + val (embeddings: Array[Array[Float]], metadata: Map[String, String]) = + try { + (model.requestBatchEmbeddings(annotationsText.toArray), Map.empty) + } catch { + case e: Exception => + logger.error("Error in llama.cpp embeddings", e) + (Array.empty[Array[Float]], Map("llamacpp_exception" -> e.getMessage)) + } + // Choose empty text for result annotations + annotations.zip(embeddings).map { case (annotation, embedding) => + Seq( + new Annotation( + annotatorType = annotation.annotatorType, + begin = annotation.begin, + end = annotation.end, + result = annotation.result, + metadata = annotation.metadata ++ metadata, + embeddings = embedding)) } - - val result: Seq[Seq[Annotation]] = + } else { + val (completedTexts: Array[String], metadata: Map[String, String]) = + try { + (model.requestBatchCompletion(annotationsText.toArray, inferenceParams), Map.empty) + } catch { + case e: Exception => + logger.error("Error in llama.cpp batch completion", e) + (Array[String](), Map("llamacpp_exception" -> e.getMessage)) + } annotations.zip(completedTexts).map { case (annotation, text) => Seq( new Annotation( @@ -201,18 +215,9 @@ class AutoGGUFModel(override val uid: String) text, annotation.metadata ++ metadata)) } - result + } } else Seq(Seq.empty[Annotation]) } - - def getMetadataMap: Map[String, String] = { - val metadataJsonString = getMetadata - if (metadataJsonString.isEmpty) Map.empty - else { - implicit val formats: DefaultFormats.type = DefaultFormats - JsonMethods.parse(metadataJsonString).extract[Map[String, String]] - } - } } trait ReadablePretrainedAutoGGUFModel diff --git a/src/main/scala/com/johnsnowlabs/nlp/embeddings/AutoGGUFEmbeddings.scala b/src/main/scala/com/johnsnowlabs/nlp/embeddings/AutoGGUFEmbeddings.scala new file mode 100644 index 00000000000000..98aa10eb8b31ac --- /dev/null +++ b/src/main/scala/com/johnsnowlabs/nlp/embeddings/AutoGGUFEmbeddings.scala @@ -0,0 +1,241 @@ +/* + * Copyright 2017-2024 John Snow Labs + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance 
with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.johnsnowlabs.nlp.embeddings + +import com.johnsnowlabs.ml.gguf.GGUFWrapper +import com.johnsnowlabs.ml.util.LlamaCPP +import com.johnsnowlabs.nlp._ +import com.johnsnowlabs.nlp.llama.LlamaModel +import com.johnsnowlabs.nlp.util.io.ResourceHelper +import org.apache.spark.broadcast.Broadcast +import org.apache.spark.ml.util.Identifiable +import org.apache.spark.sql.SparkSession + +/** Annotator that uses the llama.cpp library to generate text embeddings with large language + * models. + * + * The type of embedding pooling can be set with the `setPoolingType` method. The default is + * `"MEAN"`. The available options are `"NONE"`, `"MEAN"`, `"CLS"`, and `"LAST"`. + * + * For all settable parameters, and their explanations, see [[HasLlamaCppModelProperties]]. + * + * Pretrained models can be loaded with `pretrained` of the companion object: + * {{{ + * val autoGGUFModel = AutoGGUFEmbeddings.pretrained() + * .setInputCols("document") + * .setOutputCol("embeddings") + * }}} + * The default model is `"nomic-embed-text-v1.5.Q8_0.gguf"`, if no name is provided. + * + * For available pretrained models please see the [[https://sparknlp.org/models Models Hub]]. + * + * For extended examples of usage, see the + * [[https://github.com/JohnSnowLabs/spark-nlp/tree/master/src/test/scala/com/johnsnowlabs/nlp/embeddings/AutoGGUFEmbeddingsTest.scala AutoGGUFEmbeddingsTest]] + * and the + * [[https://github.com/JohnSnowLabs/spark-nlp/tree/master/examples/python/llama.cpp/llama.cpp_in_Spark_NLP_AutoGGUFEmbeddings.ipynb example notebook]]. + * + * ==Note== + * To use GPU inference with this annotator, make sure to use the Spark NLP GPU package and set + * the number of GPU layers with the `setNGpuLayers` method. + * + * When using larger models, we recommend adjusting GPU usage with `setNCtx` and `setNGpuLayers` + * according to your hardware to avoid out-of-memory errors. 
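+ *
+ * For example, a hypothetical configuration (the exact values depend on the model size and
+ * the available hardware, so treat them only as a starting point) could look like this:
+ * {{{
+ * val embeddings = AutoGGUFEmbeddings
+ *   .pretrained()
+ *   .setInputCols("document")
+ *   .setOutputCol("embeddings")
+ *   .setPoolingType("MEAN") // one of "NONE", "MEAN", "CLS", "LAST"
+ *   .setNCtx(4096)          // context size used by the model
+ *   .setNGpuLayers(99)      // offload all layers when using the GPU package
+ * }}}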
+ *
+ * ==Example==
+ *
+ * {{{
+ * import com.johnsnowlabs.nlp.base._
+ * import com.johnsnowlabs.nlp.annotator._
+ * import org.apache.spark.ml.Pipeline
+ * import spark.implicits._
+ *
+ * val document = new DocumentAssembler().setInputCol("text").setOutputCol("document")
+ *
+ * val autoGGUFEmbeddings = AutoGGUFEmbeddings
+ *   .pretrained()
+ *   .setInputCols("document")
+ *   .setOutputCol("embeddings")
+ *   .setBatchSize(4)
+ *   .setPoolingType("MEAN")
+ *
+ * val pipeline = new Pipeline().setStages(Array(document, autoGGUFEmbeddings))
+ *
+ * val data = Seq(
+ *   "The moons of Jupiter are 77 in total, with 79 confirmed natural satellites and 2 man-made ones.")
+ *   .toDF("text")
+ * val result = pipeline.fit(data).transform(data)
+ * result.select("embeddings.embeddings").show(truncate = false)
+ * +--------------------------------------------------------------------------------+
+ * | embeddings|
+ * +--------------------------------------------------------------------------------+
+ * |[[-0.034486726, 0.07770534, -0.15982522, -0.017873349, 0.013914132, 0.0365736...|
+ * +--------------------------------------------------------------------------------+
+ * }}}
+ *
+ * @param uid
+ *   required uid for storing annotator to disk
+ * @groupname anno Annotator types
+ * @groupdesc anno
+ *   Required input and expected output annotator types
+ * @groupname Ungrouped Members
+ * @groupname param Parameters
+ * @groupname setParam Parameter setters
+ * @groupname getParam Parameter getters
+ * @groupprio param 1
+ * @groupprio anno 2
+ * @groupprio Ungrouped 3
+ * @groupprio setParam 4
+ * @groupprio getParam 5
+ * @groupdesc param
+ *   A list of (hyper-)parameter keys this annotator can take. Users can set and get the
+ *   parameter values through setters and getters, respectively.
+ */
+class AutoGGUFEmbeddings(override val uid: String)
+    extends AnnotatorModel[AutoGGUFEmbeddings]
+    with HasBatchedAnnotate[AutoGGUFEmbeddings]
+    with HasEngine
+    with HasLlamaCppModelProperties
+    with HasProtectedParams {
+
+  override val inputAnnotatorTypes: Array[AnnotatorType] = Array(AnnotatorType.DOCUMENT)
+  override val outputAnnotatorType: AnnotatorType = AnnotatorType.SENTENCE_EMBEDDINGS
+
+  /** Annotator reference id. Used to identify elements in metadata or to refer to this annotator
+    * type
+    */
+  def this() = this(Identifiable.randomUID("AutoGGUFEmbeddings"))
+
+  private var _model: Option[Broadcast[GGUFWrapper]] = None
+
+  /** @group getParam */
+  def getModelIfNotSet: GGUFWrapper = _model.get.value
+
+  /** @group setParam */
+  def setModelIfNotSet(spark: SparkSession, wrapper: GGUFWrapper): this.type = {
+    if (_model.isEmpty) {
+      _model = Some(spark.sparkContext.broadcast(wrapper))
+    }
+
+    setGpuSupportIfAvailable(spark)
+  }
+
+  private[johnsnowlabs] def setEngine(engineName: String): this.type = set(engine, engineName)
+
+  setDefault(
+    engine -> LlamaCPP.name,
+    embedding -> true,
+    poolingType -> "MEAN",
+    nCtx -> 4096,
+    nBatch -> 512)
+
+  override def onWrite(path: String, spark: SparkSession): Unit = {
+    super.onWrite(path, spark)
+    getModelIfNotSet.saveToFile(path)
+  }
+
+  /** Computes sentence embeddings for the batch of annotations.
+    *
+    * @param batchedAnnotations
+    *   Annotations (single element arrays) in batches
+    * @return
+    *   Sentence embeddings for each input annotation
+    */
+  override def batchAnnotate(batchedAnnotations: Seq[Array[Annotation]]): Seq[Seq[Annotation]] = {
+    require(
+      getEmbedding,
+      "Embeddings have been manually disabled.
Please enable them with setEmbedding(true).") + val annotations: Seq[Annotation] = batchedAnnotations.flatten + if (annotations.nonEmpty) { + + val modelParams = + getModelParameters.setNParallel(getBatchSize) // set parallel decoding to batch size + + val model: LlamaModel = getModelIfNotSet.getSession(modelParams) + + val annotationsText = annotations.map(_.result) + + // Return embeddings in annotation + val (embeddings: Array[Array[Float]], metadata: Map[String, String]) = + try { + (model.requestBatchEmbeddings(annotationsText.toArray), Map.empty) + } catch { + case e: Exception => + logger.error("Error in llama.cpp embeddings", e) + (Array.empty[Array[Float]], Map("llamacpp_exception" -> e.getMessage)) + } + + // Choose empty text for result annotations + annotations.zip(embeddings).map { case (annotation, embedding) => + Seq( + new Annotation( + annotatorType = annotation.annotatorType, + begin = annotation.begin, + end = annotation.end, + result = annotation.result, + metadata = annotation.metadata ++ metadata, + embeddings = embedding)) + } + } else Seq(Seq.empty[Annotation]) + } +} + +trait ReadablePretrainedAutoGGUFEmbeddings + extends ParamsAndFeaturesReadable[AutoGGUFEmbeddings] + with HasPretrained[AutoGGUFEmbeddings] { + override val defaultModelName: Some[String] = Some("nomic-embed-text-v1.5.Q8_0.gguf") + override val defaultLang: String = "en" + + /** Java compliant-overrides */ + override def pretrained(): AutoGGUFEmbeddings = super.pretrained() + + override def pretrained(name: String): AutoGGUFEmbeddings = super.pretrained(name) + + override def pretrained(name: String, lang: String): AutoGGUFEmbeddings = + super.pretrained(name, lang) + + override def pretrained(name: String, lang: String, remoteLoc: String): AutoGGUFEmbeddings = + super.pretrained(name, lang, remoteLoc) +} + +trait ReadAutoGGUFEmbeddings { + this: ParamsAndFeaturesReadable[AutoGGUFEmbeddings] => + + def readModel(instance: AutoGGUFEmbeddings, path: String, spark: SparkSession): Unit = { + val model: GGUFWrapper = GGUFWrapper.readModel(path, spark) + instance.setModelIfNotSet(spark, model) + } + + addReader(readModel) + + def loadSavedModel(modelPath: String, spark: SparkSession): AutoGGUFEmbeddings = { + // TODO potentially enable download from HF-URLS + val localPath: String = ResourceHelper.copyToLocal(modelPath) + val annotatorModel = new AutoGGUFEmbeddings() + annotatorModel + .setModelIfNotSet(spark, GGUFWrapper.read(spark, localPath)) + .setEngine(LlamaCPP.name) + + val metadata = LlamaModel.getMetadataFromFile(localPath) + if (metadata.nonEmpty) annotatorModel.setMetadata(metadata) + annotatorModel + } +} + +/** This is the companion object of [[AutoGGUFEmbeddings]]. Please refer to that class for the + * documentation. 
+ */ +object AutoGGUFEmbeddings extends ReadablePretrainedAutoGGUFEmbeddings with ReadAutoGGUFEmbeddings diff --git a/src/main/scala/com/johnsnowlabs/nlp/pretrained/ResourceDownloader.scala b/src/main/scala/com/johnsnowlabs/nlp/pretrained/ResourceDownloader.scala index 145bcc67f26b35..b359523c202f37 100644 --- a/src/main/scala/com/johnsnowlabs/nlp/pretrained/ResourceDownloader.scala +++ b/src/main/scala/com/johnsnowlabs/nlp/pretrained/ResourceDownloader.scala @@ -690,7 +690,8 @@ object PythonResourceDownloader { "SnowFlakeEmbeddings" -> SnowFlakeEmbeddings, "CamemBertForZeroShotClassification" -> CamemBertForZeroShotClassification, "BertForMultipleChoice" -> BertForMultipleChoice, - "PromptAssembler" -> PromptAssembler) + "PromptAssembler" -> PromptAssembler, + "AutoGGUFEmbeddings" -> AutoGGUFEmbeddings) // List pairs of types such as the one with key type can load a pretrained model from the value type val typeMapper: Map[String, String] = Map("ZeroShotNerModel" -> "RoBertaForQuestionAnswering") diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/seq2seq/AutoGGUFModelTest.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/seq2seq/AutoGGUFModelTest.scala index b4234f24197b7c..f755b76dfa2e72 100644 --- a/src/test/scala/com/johnsnowlabs/nlp/annotators/seq2seq/AutoGGUFModelTest.scala +++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/seq2seq/AutoGGUFModelTest.scala @@ -14,9 +14,6 @@ class AutoGGUFModelTest extends AnyFlatSpec { behavior of "AutoGGUFModelTest" - // Set Spark Debug level - ResourceHelper.spark.sparkContext.setLogLevel("INFO") - lazy val documentAssembler = new DocumentAssembler() .setInputCol("text") .setOutputCol("document") diff --git a/src/test/scala/com/johnsnowlabs/nlp/embeddings/AutoGGUFEmbeddingsTestSpec.scala b/src/test/scala/com/johnsnowlabs/nlp/embeddings/AutoGGUFEmbeddingsTestSpec.scala new file mode 100644 index 00000000000000..b7c4544bdbd87f --- /dev/null +++ b/src/test/scala/com/johnsnowlabs/nlp/embeddings/AutoGGUFEmbeddingsTestSpec.scala @@ -0,0 +1,86 @@ +package com.johnsnowlabs.nlp.embeddings + +import com.johnsnowlabs.nlp.Annotation +import com.johnsnowlabs.nlp.base.DocumentAssembler +import com.johnsnowlabs.nlp.util.io.ResourceHelper +import com.johnsnowlabs.tags.SlowTest +import org.apache.spark.ml.Pipeline +import org.scalatest.flatspec.AnyFlatSpec + +class AutoGGUFEmbeddingsTestSpec extends AnyFlatSpec { + import ResourceHelper.spark.implicits._ + + behavior of "AutoGGUFEmbeddings" + + lazy val documentAssembler = new DocumentAssembler() + .setInputCol("text") + .setOutputCol("document") + + lazy val data = Seq( + "The moons of Jupiter are ", // "The moons of Jupiter are 77 in total, with 79 confirmed natural satellites and 2 man-made ones. The four" + "Earth is ", // "Earth is 4.5 billion years old. It has been home to countless species, some of which have gone extinct, while others have evolved into" + "The moon is ", // "The moon is 1/400th the size of the sun. 
The sun is 1.39 million kilometers in diameter, while" + "The sun is " // + ).toDF("text").repartition(1) + + // nomic-embed-text-v1.5.Q8_0.gguf + def model(poolingType: String): AutoGGUFEmbeddings = AutoGGUFEmbeddings + .pretrained() + .setInputCols("document") + .setOutputCol("embeddings") + .setBatchSize(4) + .setPoolingType(poolingType) + + def pipeline(embedModel: AutoGGUFEmbeddings = model("MEAN")) = + new Pipeline().setStages(Array(documentAssembler, embedModel)) + + it should "produce embeddings" taggedAs SlowTest in { + val result = pipeline().fit(data).transform(data) + val collected = Annotation.collect(result, "embeddings") + + collected.foreach { annotations => + val embeddings = annotations.head.embeddings + assert(embeddings != null, "embeddings should not be null") + assert( + embeddings.sum > 0.0, + "embeddings should not be zero. Was there an error on llama.cpp side?") + } + } + + it should "produce embeddings of different pooling types" taggedAs SlowTest in { + def testPoolingType(poolingType: String): Unit = { + val result = pipeline(model(poolingType)).fit(data).transform(data) + val embeddings: Array[Float] = Annotation.collect(result, "embeddings").head.head.embeddings + + assert(embeddings != null, "embeddings should not be null") + assert( + embeddings.sum > 0.0, + "embeddings should not be zero. Was there an error on llama.cpp side?") + } + + Seq("NONE", "MEAN", "CLS", "LAST").foreach(testPoolingType) + } + + it should "be serializable" taggedAs SlowTest in { + + val data = Seq("Hello, I am a").toDF("text") + lazy val pipeline = new Pipeline().setStages(Array(documentAssembler, model("MEAN"))) + + val pipelineModel = pipeline.fit(data) + val savePath = "./tmp_autogguf_model" + pipelineModel.stages.last + .asInstanceOf[AutoGGUFEmbeddings] + .write + .overwrite() + .save(savePath) + + val loadedModel = AutoGGUFEmbeddings.load(savePath) + val newPipeline: Pipeline = new Pipeline().setStages(Array(documentAssembler, loadedModel)) + + newPipeline + .fit(data) + .transform(data) + .select("embeddings.embeddings") + .show(truncate = false) + } +}
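
Besides `pretrained()`, the companion object's `loadSavedModel` can load a local GGUF file directly (see `ReadAutoGGUFEmbeddings` above). The following is a minimal sketch, assuming an existing `spark` session and a hypothetical local model path; any embedding model in GGUF format should work:

```scala
import com.johnsnowlabs.nlp.base.DocumentAssembler
import com.johnsnowlabs.nlp.embeddings.AutoGGUFEmbeddings
import org.apache.spark.ml.Pipeline
import spark.implicits._

val documentAssembler = new DocumentAssembler()
  .setInputCol("text")
  .setOutputCol("document")

// Hypothetical local path to a GGUF embedding model
val embeddings = AutoGGUFEmbeddings
  .loadSavedModel("/tmp/nomic-embed-text-v1.5.Q8_0.gguf", spark)
  .setInputCols("document")
  .setOutputCol("embeddings")
  .setPoolingType("MEAN")

val pipeline = new Pipeline().setStages(Array(documentAssembler, embeddings))

val data = Seq("Spark NLP is an open-source text processing library.").toDF("text")
pipeline.fit(data).transform(data).select("embeddings.embeddings").show()
```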