From d027c5278e5ebe801cc435c09ee2bfaeaf3540b3 Mon Sep 17 00:00:00 2001
From: Zhedong Cen
Date: Thu, 8 Aug 2024 16:09:30 +0800
Subject: [PATCH 1/4] add using jina deploy local llm in deploy_local_llm.mdx

---
 docs/guides/deploy_local_llm.mdx | 38 +++++++++++++++++++++++++-------
 1 file changed, 30 insertions(+), 8 deletions(-)

diff --git a/docs/guides/deploy_local_llm.mdx b/docs/guides/deploy_local_llm.mdx
index ad817390ff3..2231e436b7c 100644
--- a/docs/guides/deploy_local_llm.mdx
+++ b/docs/guides/deploy_local_llm.mdx
@@ -3,17 +3,39 @@ sidebar_position: 5
 slug: /deploy_local_llm
 ---
 
-# Deploy a local LLM
-import Tabs from '@theme/Tabs';
-import TabItem from '@theme/TabItem';
+## Deploy a local model using Jina
 
-RAGFlow supports deploying models locally using Ollama or Xinference. If you have locally deployed models to leverage or wish to enable GPU or CUDA for inference acceleration, you can bind Ollama or Xinference into RAGFlow and use either of them as a local "server" for interacting with your local models.
+[Jina](https://github.com/jina-ai/jina) lets you build AI services and pipelines that communicate via gRPC, HTTP and WebSockets, then scale them up and deploy to production.
 
-RAGFlow seamlessly integrates with Ollama and Xinference, without the need for further environment configurations. You can use them to deploy two types of local models in RAGFlow: chat models and embedding models.
+To deploy a local model, e.g., **gpt2**, using Jina:
 
-:::tip NOTE
-This user guide does not intend to cover much of the installation or configuration details of Ollama or Xinference; its focus is on configurations inside RAGFlow. For the most current information, you may need to check out the official site of Ollama or Xinference.
-:::
+### 1. Check firewall settings
+
+Ensure that your host machine's firewall allows inbound connections on port 12345:
+
+```bash
+sudo ufw allow 12345/tcp
+```
+
+### 2. Install the Jina package
+
+```bash
+pip install jina
+```
+
+### 3. Deploy the local model
+
+Step 1: Navigate to the rag/svr directory.
+
+```bash
+cd rag/svr
+```
+
+Step 2: Run the jina_server.py script, passing in the model name or the local path of the model (the script only supports loading models downloaded from Hugging Face):
+
+```bash
+python jina_server.py --model_name gpt2
+```
 
 ## Deploy a local model using Ollama
 

From d962dacf14b99e6db4e7685d5a0907ad46637d74 Mon Sep 17 00:00:00 2001
From: Zhedong Cen
Date: Thu, 8 Aug 2024 16:13:36 +0800
Subject: [PATCH 2/4] update

---
 docs/guides/deploy_local_llm.mdx | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/docs/guides/deploy_local_llm.mdx b/docs/guides/deploy_local_llm.mdx
index 2231e436b7c..040e3ef9661 100644
--- a/docs/guides/deploy_local_llm.mdx
+++ b/docs/guides/deploy_local_llm.mdx
@@ -3,6 +3,18 @@ sidebar_position: 5
 slug: /deploy_local_llm
 ---
 
+# Deploy a local LLM
+import Tabs from '@theme/Tabs';
+import TabItem from '@theme/TabItem';
+
+RAGFlow supports deploying models locally using Ollama or Xinference. If you have locally deployed models to leverage or wish to enable GPU or CUDA for inference acceleration, you can bind Ollama or Xinference into RAGFlow and use either of them as a local "server" for interacting with your local models.
+
+RAGFlow seamlessly integrates with Ollama and Xinference, without the need for further environment configurations. You can use them to deploy two types of local models in RAGFlow: chat models and embedding models.
+
+:::tip NOTE
+This user guide does not intend to cover much of the installation or configuration details of Ollama or Xinference; its focus is on configurations inside RAGFlow. For the most current information, you may need to check out the official site of Ollama or Xinference.
+:::
+
 ## Deploy a local model using Jina
 
 [Jina](https://github.com/jina-ai/jina) lets you build AI services and pipelines that communicate via gRPC, HTTP and WebSockets, then scale them up and deploy to production.
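The guide added above tells readers to run `python jina_server.py --model_name gpt2`, but the script itself is not part of this patch series. For reference only, below is a minimal, hypothetical sketch of what a Jina-based server for a Hugging Face model could look like. It is not RAGFlow's actual `rag/svr/jina_server.py`; the class and schema names (`GPT2Generator`, `Prompt`, `Generation`) are illustrative, and it assumes a recent `jina` (with docarray v2) plus `transformers` are installed.

```python
# Hypothetical sketch only -- NOT RAGFlow's rag/svr/jina_server.py.
# Assumes: jina 3.x with docarray v2, transformers, and a "gpt2" checkpoint.
from docarray import BaseDoc, DocList
from jina import Deployment, Executor, requests
from transformers import pipeline


class Prompt(BaseDoc):
    text: str = ""


class Generation(BaseDoc):
    text: str = ""


class GPT2Generator(Executor):
    """Wraps a Hugging Face text-generation pipeline behind a Jina Executor."""

    def __init__(self, model_name: str = "gpt2", **kwargs):
        super().__init__(**kwargs)
        self.generator = pipeline("text-generation", model=model_name)

    @requests
    def generate(self, docs: DocList[Prompt], **kwargs) -> DocList[Generation]:
        # One generated text per incoming prompt document.
        return DocList[Generation](
            Generation(text=self.generator(doc.text, max_new_tokens=64)[0]["generated_text"])
            for doc in docs
        )


if __name__ == "__main__":
    # Serve over HTTP on the port the guide opens in the firewall.
    with Deployment(uses=GPT2Generator, port=12345, protocol="http") as dep:
        dep.block()
```

A client could then send prompts to `http://localhost:12345/` (for example via `jina.Client`); the request schema of RAGFlow's real script may differ.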
From 23137f0b94315b9981f05c0fe0d3edecf5cb63bd Mon Sep 17 00:00:00 2001
From: Zhedong Cen
Date: Fri, 9 Aug 2024 10:04:42 +0800
Subject: [PATCH 3/4] Enhance the robustness of the code

---
 api/apps/llm_app.py   | 2 +-
 rag/llm/chat_model.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/api/apps/llm_app.py b/api/apps/llm_app.py
index 3f5a0b7be44..6da5a20ed2a 100644
--- a/api/apps/llm_app.py
+++ b/api/apps/llm_app.py
@@ -131,7 +131,7 @@ def add_llm():
         api_key = "xxxxxxxxxxxxxxx"
     elif factory == "OpenAI-API-Compatible":
         llm_name = req["llm_name"]+"___OpenAI-API"
-        api_key = req["api_key"]
+        api_key = req.get("api_key","xxxxxxxxxxxxxxx")
     else:
         llm_name = req["llm_name"]
         api_key = "xxxxxxxxxxxxxxx"
diff --git a/rag/llm/chat_model.py b/rag/llm/chat_model.py
index f59c4785f25..04463e931fa 100644
--- a/rag/llm/chat_model.py
+++ b/rag/llm/chat_model.py
@@ -72,7 +72,7 @@ def chat_streamly(self, system, history, gen_conf):
                         + num_tokens_from_string(resp.choices[0].delta.content)
                     )
                     if not hasattr(resp, "usage") or not resp.usage
-                    else resp.usage["total_tokens"]
+                    else resp.usage.get("total_tokens",total_tokens)
                 )
                 if resp.choices[0].finish_reason == "length":
                     ans += "...\nFor the content length reason, it stopped, continue?" if is_english(

From 66fa96876498e3c9831a59acc1441560f71c3f36 Mon Sep 17 00:00:00 2001
From: Zhedong Cen
Date: Fri, 9 Aug 2024 10:08:16 +0800
Subject: [PATCH 4/4] update

---
 api/apps/llm_app.py   | 2 +-
 rag/llm/chat_model.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/api/apps/llm_app.py b/api/apps/llm_app.py
index 6da5a20ed2a..3f5a0b7be44 100644
--- a/api/apps/llm_app.py
+++ b/api/apps/llm_app.py
@@ -131,7 +131,7 @@ def add_llm():
         api_key = "xxxxxxxxxxxxxxx"
     elif factory == "OpenAI-API-Compatible":
         llm_name = req["llm_name"]+"___OpenAI-API"
-        api_key = req.get("api_key","xxxxxxxxxxxxxxx")
+        api_key = req["api_key"]
     else:
         llm_name = req["llm_name"]
         api_key = "xxxxxxxxxxxxxxx"
diff --git a/rag/llm/chat_model.py b/rag/llm/chat_model.py
index 04463e931fa..f59c4785f25 100644
--- a/rag/llm/chat_model.py
+++ b/rag/llm/chat_model.py
@@ -72,7 +72,7 @@ def chat_streamly(self, system, history, gen_conf):
                         + num_tokens_from_string(resp.choices[0].delta.content)
                     )
                     if not hasattr(resp, "usage") or not resp.usage
-                    else resp.usage.get("total_tokens",total_tokens)
+                    else resp.usage["total_tokens"]
                 )
                 if resp.choices[0].finish_reason == "length":
                     ans += "...\nFor the content length reason, it stopped, continue?" if is_english(
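PATCH 3/4 swaps direct lookups (`req["api_key"]`, `resp.usage["total_tokens"]`) for `.get()` calls with fallbacks so a missing key no longer raises, and PATCH 4/4 restores the original lines. For reference, here is a minimal, self-contained illustration of that fallback pattern; the `req` and `usage` dictionaries below are stand-ins, not RAGFlow's actual request payload or the OpenAI SDK's usage object.

```python
# Illustration of the dict.get fallback pattern from PATCH 3/4; plain dicts,
# not RAGFlow's real request payload or the provider's usage object.

def resolve_api_key(req: dict) -> str:
    # req["api_key"] raises KeyError when the field is absent;
    # .get() substitutes the same placeholder the other branches use.
    return req.get("api_key", "xxxxxxxxxxxxxxx")


def resolve_total_tokens(usage, running_total: int) -> int:
    # Keep the locally accumulated token count when the provider omits usage data.
    if not usage:
        return running_total
    return usage.get("total_tokens", running_total)


print(resolve_api_key({"llm_name": "gpt2"}))           # -> xxxxxxxxxxxxxxx
print(resolve_total_tokens(None, 42))                  # -> 42
print(resolve_total_tokens({"total_tokens": 128}, 42)) # -> 128
```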