From 02dc83e8e9f7ada181ff813f25051bbdff7b7c6b Mon Sep 17 00:00:00 2001
From: icsy7867
Date: Mon, 11 Mar 2024 17:51:05 -0400
Subject: [PATCH] feat(llm): adds several settings for llamacpp and ollama
 (#1703)

---
 private_gpt/__init__.py                     |  1 +
 private_gpt/components/llm/llm_component.py | 29 ++++++++++--
 .../vector_store/vector_store_component.py  |  8 ++--
 private_gpt/launcher.py                     |  1 +
 private_gpt/server/utils/auth.py            |  1 +
 private_gpt/settings/settings.py            | 45 +++++++++++++++++++
 private_gpt/ui/ui.py                        |  1 +
 settings-ollama.yaml                        |  7 ++-
 settings.yaml                               |  5 +++
 tests/server/utils/test_simple_auth.py      |  1 +
 10 files changed, 91 insertions(+), 8 deletions(-)

diff --git a/private_gpt/__init__.py b/private_gpt/__init__.py
index 995365c8d..d4f1db4f5 100644
--- a/private_gpt/__init__.py
+++ b/private_gpt/__init__.py
@@ -1,4 +1,5 @@
 """private-gpt."""
+
 import logging
 import os
 
diff --git a/private_gpt/components/llm/llm_component.py b/private_gpt/components/llm/llm_component.py
index 351513e46..d4e13a585 100644
--- a/private_gpt/components/llm/llm_component.py
+++ b/private_gpt/components/llm/llm_component.py
@@ -39,16 +39,23 @@ def __init__(self, settings: Settings) -> None:
                     ) from e

                 prompt_style = get_prompt_style(settings.llamacpp.prompt_style)
-
+                settings_kwargs = {
+                    "tfs_z": settings.llamacpp.tfs_z,  # ollama and llama-cpp
+                    "top_k": settings.llamacpp.top_k,  # ollama and llama-cpp
+                    "top_p": settings.llamacpp.top_p,  # ollama and llama-cpp
+                    "repeat_penalty": settings.llamacpp.repeat_penalty,  # ollama and llama-cpp
+                    "n_gpu_layers": -1,
+                    "offload_kqv": True,
+                }
                 self.llm = LlamaCPP(
                     model_path=str(models_path / settings.llamacpp.llm_hf_model_file),
-                    temperature=0.1,
+                    temperature=settings.llm.temperature,
                     max_new_tokens=settings.llm.max_new_tokens,
                     context_window=settings.llm.context_window,
                     generate_kwargs={},
                     callback_manager=LlamaIndexSettings.callback_manager,
                     # All to GPU
-                    model_kwargs={"n_gpu_layers": -1, "offload_kqv": True},
+                    model_kwargs=settings_kwargs,
                     # transform inputs into Llama2 format
                     messages_to_prompt=prompt_style.messages_to_prompt,
                     completion_to_prompt=prompt_style.completion_to_prompt,
@@ -108,8 +115,22 @@ def __init__(self, settings: Settings) -> None:
                     ) from e

                 ollama_settings = settings.ollama
+
+                settings_kwargs = {
+                    "tfs_z": ollama_settings.tfs_z,  # ollama and llama-cpp
+                    "num_predict": ollama_settings.num_predict,  # ollama only
+                    "top_k": ollama_settings.top_k,  # ollama and llama-cpp
+                    "top_p": ollama_settings.top_p,  # ollama and llama-cpp
+                    "repeat_last_n": ollama_settings.repeat_last_n,  # ollama only
+                    "repeat_penalty": ollama_settings.repeat_penalty,  # ollama and llama-cpp
+                }
+
                 self.llm = Ollama(
-                    model=ollama_settings.llm_model, base_url=ollama_settings.api_base
+                    model=ollama_settings.llm_model,
+                    base_url=ollama_settings.api_base,
+                    temperature=settings.llm.temperature,
+                    context_window=settings.llm.context_window,
+                    additional_kwargs=settings_kwargs,
                 )
             case "mock":
                 self.llm = MockLLM()
diff --git a/private_gpt/components/vector_store/vector_store_component.py b/private_gpt/components/vector_store/vector_store_component.py
index 0b677c875..5641f25e7 100644
--- a/private_gpt/components/vector_store/vector_store_component.py
+++ b/private_gpt/components/vector_store/vector_store_component.py
@@ -137,9 +137,11 @@ def get_retriever(
             index=index,
             similarity_top_k=similarity_top_k,
             doc_ids=context_filter.docs_ids if context_filter else None,
-            filters=_doc_id_metadata_filter(context_filter)
-            if self.settings.vectorstore.database != "qdrant"
-            else None,
+            filters=(
+                _doc_id_metadata_filter(context_filter)
+                if self.settings.vectorstore.database != "qdrant"
+                else None
+            ),
         )

     def close(self) -> None:
diff --git a/private_gpt/launcher.py b/private_gpt/launcher.py
index 5cce8c72a..43bd803a5 100644
--- a/private_gpt/launcher.py
+++ b/private_gpt/launcher.py
@@ -1,4 +1,5 @@
 """FastAPI app creation, logger configuration and main API routes."""
+
 import logging

 from fastapi import Depends, FastAPI, Request
diff --git a/private_gpt/server/utils/auth.py b/private_gpt/server/utils/auth.py
index 4fd57a7fe..2eb40fe56 100644
--- a/private_gpt/server/utils/auth.py
+++ b/private_gpt/server/utils/auth.py
@@ -12,6 +12,7 @@
 * https://fastapi.tiangolo.com/tutorial/security/
 * https://fastapi.tiangolo.com/tutorial/dependencies/dependencies-in-path-operation-decorators/
 """
+
 # mypy: ignore-errors
 # Disabled mypy error: All conditional function variants must have identical signatures
 # We are changing the implementation of the authenticated method, based on
diff --git a/private_gpt/settings/settings.py b/private_gpt/settings/settings.py
index cbb890237..62af3f341 100644
--- a/private_gpt/settings/settings.py
+++ b/private_gpt/settings/settings.py
@@ -98,6 +98,10 @@ class LLMSettings(BaseModel):
         "like `HuggingFaceH4/zephyr-7b-beta`. If not set, will load a tokenizer matching "
         "gpt-3.5-turbo LLM.",
     )
+    temperature: float = Field(
+        0.1,
+        description="The temperature of the model. Increasing the temperature will make the model answer more creatively. A value of 0.1 would be more factual.",
+    )


 class VectorstoreSettings(BaseModel):
@@ -119,6 +123,23 @@ class LlamaCPPSettings(BaseModel):
         ),
     )
+
+    tfs_z: float = Field(
+        1.0,
+        description="Tail free sampling is used to reduce the impact of less probable tokens from the output. A higher value (e.g., 2.0) will reduce the impact more, while a value of 1.0 disables this setting.",
+    )
+    top_k: int = Field(
+        40,
+        description="Reduces the probability of generating nonsense. A higher value (e.g. 100) will give more diverse answers, while a lower value (e.g. 10) will be more conservative. (Default: 40)",
+    )
+    top_p: float = Field(
+        0.9,
+        description="Works together with top-k. A higher value (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text. (Default: 0.9)",
+    )
+    repeat_penalty: float = Field(
+        1.1,
+        description="Sets how strongly to penalize repetitions. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient. (Default: 1.1)",
+    )


 class HuggingFaceSettings(BaseModel):
     embedding_hf_model_name: str = Field(
@@ -184,6 +205,30 @@ class OllamaSettings(BaseModel):
         None,
         description="Model to use. Example: 'nomic-embed-text'.",
     )
+    tfs_z: float = Field(
+        1.0,
+        description="Tail free sampling is used to reduce the impact of less probable tokens from the output. A higher value (e.g., 2.0) will reduce the impact more, while a value of 1.0 disables this setting.",
+    )
+    num_predict: int = Field(
+        None,
+        description="Maximum number of tokens to predict when generating text. (Default: 128, -1 = infinite generation, -2 = fill context)",
+    )
+    top_k: int = Field(
+        40,
+        description="Reduces the probability of generating nonsense. A higher value (e.g. 100) will give more diverse answers, while a lower value (e.g. 10) will be more conservative. (Default: 40)",
+    )
+    top_p: float = Field(
+        0.9,
+        description="Works together with top-k. A higher value (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text. (Default: 0.9)",
+    )
+    repeat_last_n: int = Field(
+        64,
+        description="Sets how far back the model looks to prevent repetition. (Default: 64, 0 = disabled, -1 = num_ctx)",
+    )
+    repeat_penalty: float = Field(
+        1.1,
+        description="Sets how strongly to penalize repetitions. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient. (Default: 1.1)",
+    )


 class UISettings(BaseModel):
diff --git a/private_gpt/ui/ui.py b/private_gpt/ui/ui.py
index 5d0f9925f..7c34e8497 100644
--- a/private_gpt/ui/ui.py
+++ b/private_gpt/ui/ui.py
@@ -1,4 +1,5 @@
 """This file should be imported only and only if you want to run the UI locally."""
+
 import itertools
 import logging
 import time
diff --git a/settings-ollama.yaml b/settings-ollama.yaml
index 4f2cab4d8..9a0aaed0a 100644
--- a/settings-ollama.yaml
+++ b/settings-ollama.yaml
@@ -5,6 +5,7 @@ llm:
   mode: ollama
   max_new_tokens: 512
   context_window: 3900
+  temperature: 0.1      # The temperature of the model. Increasing the temperature will make the model answer more creatively. A value of 0.1 would be more factual. (Default: 0.1)

 embedding:
   mode: ollama
@@ -13,10 +14,14 @@ ollama:
   llm_model: mistral
   embedding_model: nomic-embed-text
   api_base: http://localhost:11434
+  tfs_z: 1.0            # Tail free sampling is used to reduce the impact of less probable tokens from the output. A higher value (e.g., 2.0) will reduce the impact more, while a value of 1.0 disables this setting.
+  top_k: 40             # Reduces the probability of generating nonsense. A higher value (e.g. 100) will give more diverse answers, while a lower value (e.g. 10) will be more conservative. (Default: 40)
+  top_p: 0.9            # Works together with top-k. A higher value (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text. (Default: 0.9)
+  repeat_last_n: 64     # Sets how far back the model looks to prevent repetition. (Default: 64, 0 = disabled, -1 = num_ctx)
+  repeat_penalty: 1.2   # Sets how strongly to penalize repetitions. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient. (Default: 1.1)

 vectorstore:
   database: qdrant

 qdrant:
   path: local_data/private_gpt/qdrant
-
diff --git a/settings.yaml b/settings.yaml
index 9d3cd0737..a9a676bdb 100644
--- a/settings.yaml
+++ b/settings.yaml
@@ -39,11 +39,16 @@ llm:
   # Should be matching the selected model
   max_new_tokens: 512
   context_window: 3900
+  temperature: 0.1      # The temperature of the model. Increasing the temperature will make the model answer more creatively. A value of 0.1 would be more factual. (Default: 0.1)

 llamacpp:
   prompt_style: "mistral"
   llm_hf_repo_id: TheBloke/Mistral-7B-Instruct-v0.2-GGUF
   llm_hf_model_file: mistral-7b-instruct-v0.2.Q4_K_M.gguf
+  tfs_z: 1.0            # Tail free sampling is used to reduce the impact of less probable tokens from the output. A higher value (e.g., 2.0) will reduce the impact more, while a value of 1.0 disables this setting.
+  top_k: 40             # Reduces the probability of generating nonsense. A higher value (e.g. 100) will give more diverse answers, while a lower value (e.g. 10) will be more conservative. (Default: 40)
+  top_p: 1.0            # Works together with top-k. A higher value (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text. (Default: 0.9)
+  repeat_penalty: 1.1   # Sets how strongly to penalize repetitions. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient. (Default: 1.1)

 embedding:
   # Should be matching the value above in most cases
diff --git a/tests/server/utils/test_simple_auth.py b/tests/server/utils/test_simple_auth.py
index 0ef3614cc..e79dd26e4 100644
--- a/tests/server/utils/test_simple_auth.py
+++ b/tests/server/utils/test_simple_auth.py
@@ -5,6 +5,7 @@
 is currently architecture (it is hard to patch the `settings` and the app while
 the tests are directly importing them).
 """
+
 from typing import Annotated

 import pytest
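
Reviewer note (not part of the patch): the sketch below is a minimal, self-contained illustration of how sampler fields declared with pydantic, as in the settings.py hunks above, end up in the settings_kwargs dict that llm_component.py hands to the LLM constructor. The OllamaSampler model, its default values, and to_additional_kwargs are hypothetical stand-ins for OllamaSettings and the inline dict in the patch; the only assumption is that pydantic is installed.

# Illustrative sketch only; names and defaults here are stand-ins, not the
# project's actual OllamaSettings class.
from pydantic import BaseModel, Field


class OllamaSampler(BaseModel):
    # Hypothetical mirror of the sampler knobs added to OllamaSettings.
    tfs_z: float = Field(1.0, description="1.0 disables tail free sampling.")
    num_predict: int = Field(128, description="Max tokens to predict (-1 = infinite).")
    top_k: int = Field(40, description="Lower values give more conservative answers.")
    top_p: float = Field(0.9, description="Works together with top_k.")
    repeat_last_n: int = Field(64, description="Look-back window for the repeat penalty.")
    repeat_penalty: float = Field(1.1, description="Strength of the repeat penalty.")


def to_additional_kwargs(sampler: OllamaSampler) -> dict:
    # Mirrors the settings_kwargs dict built in llm_component.py above:
    # these keys are forwarded as additional_kwargs to the Ollama client.
    return {
        "tfs_z": sampler.tfs_z,
        "num_predict": sampler.num_predict,
        "top_k": sampler.top_k,
        "top_p": sampler.top_p,
        "repeat_last_n": sampler.repeat_last_n,
        "repeat_penalty": sampler.repeat_penalty,
    }


if __name__ == "__main__":
    # Values would normally come from settings-ollama.yaml; these are examples.
    sampler = OllamaSampler(top_k=20, repeat_penalty=1.2)
    print(to_additional_kwargs(sampler))
    # {'tfs_z': 1.0, 'num_predict': 128, 'top_k': 20, 'top_p': 0.9,
    #  'repeat_last_n': 64, 'repeat_penalty': 1.2}

Building the mapping in one place makes it easy to see which knobs apply to both backends (tfs_z, top_k, top_p, repeat_penalty) and which are Ollama-only (num_predict, repeat_last_n), which is the same split the inline comments in the patch call out.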