feat(llm): adds several settings for llamacpp and ollama (#1703)
icsy7867 committed Mar 11, 2024
1 parent 410bf7a commit 02dc83e
Showing 10 changed files with 91 additions and 8 deletions.
1 change: 1 addition & 0 deletions private_gpt/__init__.py
@@ -1,4 +1,5 @@
"""private-gpt."""

import logging
import os

29 changes: 25 additions & 4 deletions private_gpt/components/llm/llm_component.py
@@ -39,16 +39,23 @@ def __init__(self, settings: Settings) -> None:
) from e

prompt_style = get_prompt_style(settings.llamacpp.prompt_style)

settings_kwargs = {
"tfs_z": settings.llamacpp.tfs_z, # ollama and llama-cpp
"top_k": settings.llamacpp.top_k, # ollama and llama-cpp
"top_p": settings.llamacpp.top_p, # ollama and llama-cpp
"repeat_penalty": settings.llamacpp.repeat_penalty, # ollama llama-cpp
"n_gpu_layers": -1,
"offload_kqv": True,
}
self.llm = LlamaCPP(
model_path=str(models_path / settings.llamacpp.llm_hf_model_file),
temperature=0.1,
temperature=settings.llm.temperature,
max_new_tokens=settings.llm.max_new_tokens,
context_window=settings.llm.context_window,
generate_kwargs={},
callback_manager=LlamaIndexSettings.callback_manager,
# All to GPU
model_kwargs={"n_gpu_layers": -1, "offload_kqv": True},
model_kwargs=settings_kwargs,
# transform inputs into Llama2 format
messages_to_prompt=prompt_style.messages_to_prompt,
completion_to_prompt=prompt_style.completion_to_prompt,
@@ -108,8 +115,22 @@ def __init__(self, settings: Settings) -> None:
) from e

ollama_settings = settings.ollama

settings_kwargs = {
"tfs_z": ollama_settings.tfs_z, # ollama and llama-cpp
"num_predict": ollama_settings.num_predict, # ollama only
"top_k": ollama_settings.top_k, # ollama and llama-cpp
"top_p": ollama_settings.top_p, # ollama and llama-cpp
"repeat_last_n": ollama_settings.repeat_last_n, # ollama
"repeat_penalty": ollama_settings.repeat_penalty, # ollama llama-cpp
}

self.llm = Ollama(
model=ollama_settings.llm_model, base_url=ollama_settings.api_base
model=ollama_settings.llm_model,
base_url=ollama_settings.api_base,
temperature=settings.llm.temperature,
context_window=settings.llm.context_window,
additional_kwargs=settings_kwargs,
)
case "mock":
self.llm = MockLLM()
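
For reference, the settings_kwargs built above map onto Ollama's documented request-level options. A minimal sketch of the equivalent raw call against a local Ollama server (the model name, prompt, and option values are illustrative, not pulled from this diff):

```python
# Hedged sketch: call Ollama's generate endpoint directly with the same
# option names the component passes through additional_kwargs.
import requests

response = requests.post(
    "http://localhost:11434/api/generate",  # Ollama's default local address
    json={
        "model": "mistral",                 # assumed model for the example
        "prompt": "Why is the sky blue?",
        "stream": False,
        "options": {
            "temperature": 0.1,
            "tfs_z": 1.0,          # 1.0 disables tail-free sampling
            "num_predict": 128,    # cap on generated tokens
            "top_k": 40,
            "top_p": 0.9,
            "repeat_last_n": 64,
            "repeat_penalty": 1.1,
        },
    },
    timeout=120,
)
print(response.json()["response"])
```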
8 changes: 5 additions & 3 deletions private_gpt/components/vector_store/vector_store_component.py
@@ -137,9 +137,11 @@ def get_retriever(
index=index,
similarity_top_k=similarity_top_k,
doc_ids=context_filter.docs_ids if context_filter else None,
filters=_doc_id_metadata_filter(context_filter)
if self.settings.vectorstore.database != "qdrant"
else None,
filters=(
_doc_id_metadata_filter(context_filter)
if self.settings.vectorstore.database != "qdrant"
else None
),
)

def close(self) -> None:
1 change: 1 addition & 0 deletions private_gpt/launcher.py
@@ -1,4 +1,5 @@
"""FastAPI app creation, logger configuration and main API routes."""

import logging

from fastapi import Depends, FastAPI, Request
1 change: 1 addition & 0 deletions private_gpt/server/utils/auth.py
@@ -12,6 +12,7 @@
* https://fastapi.tiangolo.com/tutorial/security/
* https://fastapi.tiangolo.com/tutorial/dependencies/dependencies-in-path-operation-decorators/
"""

# mypy: ignore-errors
# Disabled mypy error: All conditional function variants must have identical signatures
# We are changing the implementation of the authenticated method, based on
45 changes: 45 additions & 0 deletions private_gpt/settings/settings.py
@@ -98,6 +98,10 @@ class LLMSettings(BaseModel):
"like `HuggingFaceH4/zephyr-7b-beta`. If not set, will load a tokenizer matching "
"gpt-3.5-turbo LLM.",
)
temperature: float = Field(
0.1,
description="The temperature of the model. Increasing the temperature will make the model answer more creatively. A value of 0.1 would be more factual.",
)
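
temperature rescales the logits before softmax: values below 1.0 sharpen the distribution toward the most likely token, values above 1.0 flatten it. A self-contained illustration of the effect (the token scores are invented for the example, not taken from any model):

```python
import math

def softmax_with_temperature(logits: list[float], temperature: float) -> list[float]:
    # Divide logits by the temperature, then apply a numerically stable softmax.
    scaled = [x / temperature for x in logits]
    m = max(scaled)
    exps = [math.exp(s - m) for s in scaled]
    total = sum(exps)
    return [e / total for e in exps]

logits = [4.0, 3.0, 1.0]  # hypothetical scores for three candidate tokens
print(softmax_with_temperature(logits, 0.1))  # near-greedy: ~[1.0, 0.0, 0.0]
print(softmax_with_temperature(logits, 1.0))  # moderate spread
print(softmax_with_temperature(logits, 2.0))  # flatter, more "creative"
```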


class VectorstoreSettings(BaseModel):
@@ -119,6 +123,23 @@ class LlamaCPPSettings(BaseModel):
),
)

tfs_z: float = Field(
1.0,
description="Tail free sampling is used to reduce the impact of less probable tokens from the output. A higher value (e.g., 2.0) will reduce the impact more, while a value of 1.0 disables this setting.",
)
top_k: int = Field(
40,
description="Reduces the probability of generating nonsense. A higher value (e.g. 100) will give more diverse answers, while a lower value (e.g. 10) will be more conservative. (Default: 40)",
)
top_p: float = Field(
0.9,
description="Works together with top-k. A higher value (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text. (Default: 0.9)",
)
repeat_penalty: float = Field(
1.1,
description="Sets how strongly to penalize repetitions. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient. (Default: 1.1)",
)
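
top_k and top_p both truncate the candidate pool before sampling: top_k keeps a fixed number of tokens, top_p (nucleus sampling) keeps the smallest high-probability prefix whose mass reaches the threshold. A rough sketch of the combined filter, illustrative only and not the backend's actual sampler (tfs_z is a third, related truncation strategy not reimplemented here):

```python
def top_k_top_p_filter(probs: dict[str, float], top_k: int, top_p: float) -> dict[str, float]:
    # Keep only the k most probable tokens...
    ranked = sorted(probs.items(), key=lambda kv: kv[1], reverse=True)[:top_k]
    # ...then keep the smallest prefix whose cumulative mass reaches top_p.
    kept: list[tuple[str, float]] = []
    cumulative = 0.0
    for token, p in ranked:
        kept.append((token, p))
        cumulative += p
        if cumulative >= top_p:
            break
    total = sum(p for _, p in kept)
    return {token: p / total for token, p in kept}  # renormalize survivors

# Invented next-token distribution for illustration.
probs = {"blue": 0.55, "red": 0.25, "green": 0.12, "purple": 0.05, "txt": 0.03}
print(top_k_top_p_filter(probs, top_k=3, top_p=0.9))  # the unlikely tail is dropped
```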


class HuggingFaceSettings(BaseModel):
embedding_hf_model_name: str = Field(
@@ -184,6 +205,30 @@ class OllamaSettings(BaseModel):
None,
description="Model to use. Example: 'nomic-embed-text'.",
)
tfs_z: float = Field(
1.0,
description="Tail free sampling is used to reduce the impact of less probable tokens from the output. A higher value (e.g., 2.0) will reduce the impact more, while a value of 1.0 disables this setting.",
)
num_predict: int = Field(
None,
description="Maximum number of tokens to predict when generating text. (Default: 128, -1 = infinite generation, -2 = fill context)",
)
top_k: int = Field(
40,
description="Reduces the probability of generating nonsense. A higher value (e.g. 100) will give more diverse answers, while a lower value (e.g. 10) will be more conservative. (Default: 40)",
)
top_p: float = Field(
0.9,
description="Works together with top-k. A higher value (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text. (Default: 0.9)",
)
repeat_last_n: int = Field(
64,
description="Sets how far back for the model to look back to prevent repetition. (Default: 64, 0 = disabled, -1 = num_ctx)",
)
repeat_penalty: float = Field(
1.1,
description="Sets how strongly to penalize repetitions. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient. (Default: 1.1)",
)
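
repeat_last_n bounds the window of recent tokens that repeat_penalty is applied to. A hedged sketch of the penalty scheme used by llama.cpp-style samplers (positive logits are divided by the penalty, negative ones multiplied); this is illustrative, not Ollama's actual source:

```python
def penalize_repeats(
    logits: dict[int, float],
    history: list[int],
    repeat_last_n: int,
    repeat_penalty: float,
) -> dict[int, float]:
    if repeat_last_n == 0:       # 0 disables the penalty window
        recent: set[int] = set()
    elif repeat_last_n < 0:      # -1 means "the whole context" per the docs
        recent = set(history)
    else:
        recent = set(history[-repeat_last_n:])
    out = dict(logits)
    for token in recent & out.keys():
        # Push the token's score down so it is less likely to be picked again.
        out[token] = out[token] / repeat_penalty if out[token] > 0 else out[token] * repeat_penalty
    return out

logits = {7: 2.0, 8: -1.0, 9: 1.5}                # hypothetical token ids/scores
print(penalize_repeats(logits, [7, 8], 64, 1.1))  # {7: ~1.82, 8: -1.1, 9: 1.5}
```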


class UISettings(BaseModel):
1 change: 1 addition & 0 deletions private_gpt/ui/ui.py
@@ -1,4 +1,5 @@
"""This file should be imported only and only if you want to run the UI locally."""

import itertools
import logging
import time
7 changes: 6 additions & 1 deletion settings-ollama.yaml
@@ -5,6 +5,7 @@ llm:
mode: ollama
max_new_tokens: 512
context_window: 3900
temperature: 0.1 # The temperature of the model. Increasing the temperature will make the model answer more creatively. A value of 0.1 would be more factual. (Default: 0.1)

embedding:
mode: ollama
@@ -13,10 +14,14 @@ ollama:
llm_model: mistral
embedding_model: nomic-embed-text
api_base: http://localhost:11434
tfs_z: 1.0 # Tail free sampling is used to reduce the impact of less probable tokens from the output. A higher value (e.g., 2.0) will reduce the impact more, while a value of 1.0 disables this setting.
top_k: 40 # Reduces the probability of generating nonsense. A higher value (e.g. 100) will give more diverse answers, while a lower value (e.g. 10) will be more conservative. (Default: 40)
top_p: 0.9 # Works together with top-k. A higher value (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text. (Default: 0.9)
repeat_last_n: 64 # Sets how far back the model looks back to prevent repetition. (Default: 64, 0 = disabled, -1 = num_ctx)
repeat_penalty: 1.2 # Sets how strongly to penalize repetitions. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient. (Default: 1.1)

vectorstore:
database: qdrant

qdrant:
path: local_data/private_gpt/qdrant

5 changes: 5 additions & 0 deletions settings.yaml
@@ -39,11 +39,16 @@ llm:
# Should be matching the selected model
max_new_tokens: 512
context_window: 3900
temperature: 0.1 # The temperature of the model. Increasing the temperature will make the model answer more creatively. A value of 0.1 would be more factual. (Default: 0.1)

llamacpp:
prompt_style: "mistral"
llm_hf_repo_id: TheBloke/Mistral-7B-Instruct-v0.2-GGUF
llm_hf_model_file: mistral-7b-instruct-v0.2.Q4_K_M.gguf
tfs_z: 1.0 # Tail free sampling is used to reduce the impact of less probable tokens from the output. A higher value (e.g., 2.0) will reduce the impact more, while a value of 1.0 disables this setting.
top_k: 40 # Reduces the probability of generating nonsense. A higher value (e.g. 100) will give more diverse answers, while a lower value (e.g. 10) will be more conservative. (Default: 40)
top_p: 1.0 # Works together with top-k. A higher value (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text. (Default: 0.9)
repeat_penalty: 1.1 # Sets how strongly to penalize repetitions. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient. (Default: 1.1)
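
At startup these YAML values override the Field defaults declared in private_gpt/settings/settings.py. A simplified sketch of that override behavior, using a hypothetical stand-in model rather than the project's real settings loader:

```python
import yaml  # PyYAML
from pydantic import BaseModel, Field

class LlamaCPPSampling(BaseModel):
    # Stripped-down stand-in for LlamaCPPSettings, defaults as in this commit.
    tfs_z: float = Field(1.0)
    top_k: int = Field(40)
    top_p: float = Field(0.9)
    repeat_penalty: float = Field(1.1)

raw = yaml.safe_load("""
llamacpp:
  top_p: 1.0
""")
settings = LlamaCPPSampling(**raw["llamacpp"])
print(settings.top_p)  # 1.0 -> taken from the YAML profile
print(settings.top_k)  # 40  -> falls back to the Field default
```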

embedding:
# Should be matching the value above in most cases
1 change: 1 addition & 0 deletions tests/server/utils/test_simple_auth.py
@@ -5,6 +5,7 @@
is currently architecture (it is hard to patch the `settings` and the app while
the tests are directly importing them).
"""

from typing import Annotated

import pytest
