Feature/vllm with chat api (#467)

zzhangpurdue authored Jun 4, 2024
1 parent 569b257 commit 0f3e2a8
Showing 4 changed files with 195 additions and 14 deletions.
32 changes: 30 additions & 2 deletions modelscope_agent/llm/openai.py
@@ -18,11 +18,13 @@ def __init__(self,
                 support_stream: Optional[bool] = None,
                 **kwargs):
        super().__init__(model, model_server, is_function_call)

        api_base = kwargs.get('api_base', 'https://api.openai.com/v1').strip()
        default_api_base = os.getenv('OPENAI_API_BASE',
                                     'https://api.openai.com/v1')
        api_base = kwargs.get('api_base', default_api_base).strip()
        api_key = kwargs.get('api_key',
                             os.getenv('OPENAI_API_KEY',
                                       default='EMPTY')).strip()
        logger.info(f'client url {api_base}, client key: {api_key}')
        self.client = OpenAI(api_key=api_key, base_url=api_base)
        self.is_function_call = is_function_call
        self.is_chat = is_chat
@@ -154,3 +156,29 @@ def chat_with_functions(self,
            model=self.model, messages=messages, **kwargs)
        # TODO: error handling
        return response.choices[0].message


@register_llm('vllm')
class Vllm(BaseChatModel):

    def _chat_stream(self,
                     messages: List[Dict],
                     stop: Optional[List[str]] = None,
                     **kwargs) -> Iterator[str]:
        stop = self._update_stop_word(stop)
        logger.info(
            f'call openai api, model: {self.model}, messages: {str(messages)}, '
            f'stop: {str(stop)}, stream: True, args: {str(kwargs)}')
        response = self.client.chat.completions.create(
            model=self.model, messages=messages, stop=stop, stream=True)
        response = self.stat_last_call_token_info(response)
        # TODO: error handling
        for chunk in response:
            # vLLM sometimes returns delta.content as None; never yield None
            if len(chunk.choices) > 0 and hasattr(
                    chunk.choices[0].delta,
                    'content') and chunk.choices[0].delta.content:
                logger.info(
                    f'call openai api success, output: {chunk.choices[0].delta.content}'
                )
                yield chunk.choices[0].delta.content
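For reference, the streaming logic above is a thin wrapper over a plain OpenAI-client call against vLLM's OpenAI-compatible endpoint. A minimal standalone sketch, assuming a vLLM server is already running at `http://localhost:8000/v1` (as launched by `scripts/run_assistant_server.sh`) and serving `Qwen2-1.5B-Instruct`:

```python
from openai import OpenAI

# vLLM exposes an OpenAI-compatible API; the key is only a placeholder.
client = OpenAI(base_url='http://localhost:8000/v1', api_key='EMPTY')

stream = client.chat.completions.create(
    model='Qwen2-1.5B-Instruct',
    messages=[{'role': 'user', 'content': 'hello'}],
    stream=True,
)

for chunk in stream:
    # Like _chat_stream above, skip chunks where delta.content is None.
    if chunk.choices and chunk.choices[0].delta.content:
        print(chunk.choices[0].delta.content, end='', flush=True)
```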
83 changes: 82 additions & 1 deletion modelscope_agent_servers/README.md
@@ -39,9 +39,12 @@ cd modelscope-agent
sh scripts/run_assistant_server.sh

# start the assistant server with specified backend
sh scripts/run_assistant_server.sh dashscope
sh scripts/run_assistant_server.sh --model-server dashscope

# start the assistant server with specified model as vllm
sh scripts/run_assistant_server.sh --served-model-name Qwen2-1.5B-Instruct --model path/to/weights
```

### Use case

#### Chat
@@ -141,6 +144,84 @@ With above examples, the output should be like this:
"usage":{"prompt_tokens":267,"completion_tokens":15,"total_tokens":282}}
```

#### Chat with vllm

You can also use the chat API with vLLM as the backend, by passing `--served-model-name` and `--model`.

A usage example is shown below.

```shell
sh scripts/run_assistant_server.sh --served-model-name Qwen2-1.5B-Instruct --model /path/to/Qwen2-1___5B-Instruct
```

Then you can use `curl` to request this API, or call the Python API as shown before.

```shell
curl -X POST 'http://localhost:31512/v1/chat/completions' \
-H 'Content-Type: application/json' \
-d '{
    "tools": [{
        "type": "function",
        "function": {
            "name": "amap_weather",
            "description": "amap weather tool",
            "parameters": [{
                "name": "location",
                "type": "string",
                "description": "城市/区具体名称,如`北京市海淀区`请描述为`海淀区`",
                "required": true
            }]
        }
    }],
    "tool_choice": "auto",
    "model": "Qwen2-1.5B-Instruct",
    "messages": [
        {"content": "海淀区天气", "role": "user"}
    ]
}'

```

With the above example, the output should look like this:
```shell
{
    "request_id": "chatcmpl_3f020464-e98d-4c7b-8717-9fca56784fe6",
    "message": "",
    "output": null,
    "id": "chatcmpl_3f020464-e98d-4c7b-8717-9fca56784fe6",
    "choices": [
        {
            "index": 0,
            "message": {
                "role": "assistant",
                "content": "好的,我已经调用了amap_weather工具查询了海淀区的天气情况。现在,让我为您展示一下查询结果吧。\n\n工具调用\nAction: amap_weather\nAction Input: {\"location\": \"海淀区\"}\n",
                "tool_calls": [
                    {
                        "type": "function",
                        "function": {
                            "name": "amap_weather",
                            "arguments": "{\"location\": \"海淀区\"}"
                        }
                    }
                ]
            },
            "finish_reason": "tool_calls"
        }
    ],
    "created": 1717485704,
    "model": "Qwen2-1.5B-Instruct",
    "system_fingerprint": "chatcmpl_3f020464-e98d-4c7b-8717-9fca56784fe6",
    "object": "chat.completion",
    "usage": {
        "prompt_tokens": 237,
        "completion_tokens": 48,
        "total_tokens": 285
    }
}
```
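As an alternative to `curl`, here is a minimal Python sketch of the same request (it assumes the `requests` package; the payload simply mirrors the curl example above):

```python
import requests

url = 'http://localhost:31512/v1/chat/completions'
payload = {
    'model': 'Qwen2-1.5B-Instruct',
    'tool_choice': 'auto',
    'tools': [{
        'type': 'function',
        'function': {
            'name': 'amap_weather',
            'description': 'amap weather tool',
            'parameters': [{
                'name': 'location',
                'type': 'string',
                'description': '城市/区具体名称,如`北京市海淀区`请描述为`海淀区`',
                'required': True
            }]
        }
    }],
    'messages': [{'role': 'user', 'content': '海淀区天气'}]
}

# Send the chat completion request and print the JSON response.
response = requests.post(url, json=payload)
print(response.json())
```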



#### Assistant

To interact with the chat API, you should construct an object like `AgentRequest` on the client side, and then use the `requests` library to send it as the request body.
5 changes: 2 additions & 3 deletions modelscope_agent_servers/assistant_server/api.py
@@ -16,7 +16,6 @@
DEFAULT_KNOWLEDGE_PATH = 'knowledges'
DEFAULT_INDEX_PATH = 'index'

model_server = os.environ.get('MODEL_SERVER', 'dashscope')
app = FastAPI()


@@ -129,12 +128,12 @@ async def chat_completion(chat_request: ChatCompletionRequest,
    user = chat_request.user
    model = chat_request.model
    # remove the prefix 'Bearer ' from the authorization header
    auth = authorization[7:] if authorization else ''
    auth = authorization[7:] if authorization else 'EMPTY'

    # llm_config
    llm_config = {
        'model': model,
        'model_server': model_server,
        'model_server': os.environ.get('MODEL_SERVER', 'dashscope'),
        'api_key': auth
    }
89 changes: 81 additions & 8 deletions scripts/run_assistant_server.sh
@@ -4,18 +4,91 @@
echo "Installing dependencies from requirements.txt..."
pip3 install -r modelscope_agent_servers/requirements.txt

# Initialize optional variables with empty strings as default values
MODEL_DIR=""
MODEL_SERVER=""
MODEL_NAME=""

# Loop through arguments and process them
while [[ $# -gt 0 ]]; do
    case $1 in
        --served-model-name)
            MODEL_NAME="$2"
            shift # past argument
            shift # past value
            ;;
        --model)
            MODEL_DIR="$2"
            shift # past argument
            shift # past value
            ;;
        --model-server)
            MODEL_SERVER="$2"
            shift # past argument
            shift # past value
            ;;
        *) # unknown option
            shift # past argument
            ;;
    esac
done

# Optionally, echo variables for debugging or confirmation
echo "Model name: $MODEL_NAME"
echo "Model directory: $MODEL_DIR"
echo "Model server: $MODEL_SERVER"


# running
echo "Running fastapi assistant server at port 31512."
export PYTHONPATH=$PYTHONPATH:modelscope_agent_servers

if [ -z "$1" ]; then
export MODEL_SERVER=dashscope
else
export MODEL_SERVER=$1
fi
if [ "$MODEL_DIR" != "" ]; then
echo "Running vllm server, please make sure install vllm"
# Start the first server in the background on port 8000
python -m vllm.entrypoints.openai.api_server --served-model-name $MODEL_NAME --model $MODEL_DIR & SERVER_1_PID=$!
export MODEL_SERVER=vllm
export OPENAI_API_BASE=http://localhost:8000/v1
echo "Model server: $MODEL_SERVER"
echo "OPENAI_API_BASE: $OPENAI_API_BASE"

# Function to check if the first server is up
check_first_server() {
echo "Checking if Server 1 is up..."
for i in {1..10}; do # try up to 10 times
curl -s http://localhost:8000 > /dev/null
if [ $? -eq 0 ]; then
echo "Server 1 is up and running."
return 0
else
echo "Server 1 is not ready yet. Retrying..."
sleep 4
fi
done
return 1
}

if [ "$MODEL_SERVER" == "ollama" ]; then
ollama serve
    # Wait for the first server to be up
    if check_first_server; then
        # Start the second server on port 31512
        echo "Starting Server 2..."
        uvicorn modelscope_agent_servers.assistant_server.api:app --host 0.0.0.0 --port 31512 & SERVER_2_PID=$!
    else
        echo "Failed to start Server 1."
        exit 1
    fi
    # Kill the first server when the second server is stopped
    wait $SERVER_1_PID
    wait $SERVER_2_PID

uvicorn modelscope_agent_servers.assistant_server.api:app --host 0.0.0.0 --port 31512
elif [ -n "$MODEL_SERVER" ]; then
echo "Running specified model server: $MODEL_SERVER..."
if [ "$MODEL_SERVER" == "ollama" ]; then
ollama serve
fi
uvicorn modelscope_agent_servers.assistant_server.api:app --host 0.0.0.0 --port 31512
else
MODEL_SERVER=dashscope
echo "Running FastAPI assistant server at port 31512 as default."
uvicorn modelscope_agent_servers.assistant_server.api:app --host 0.0.0.0 --port 31512
fi
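For reference, the polling in `check_first_server` can be expressed in a few lines of Python; this is only a sketch of the same idea (same endpoint, retry count, and delay as the script, using the `requests` package):

```python
import time

import requests


def wait_for_vllm(url='http://localhost:8000', retries=10, delay=4):
    """Poll the vLLM server until it responds, mirroring check_first_server."""
    for _ in range(retries):
        try:
            # Any HTTP response counts as "up", matching the curl check.
            requests.get(url, timeout=2)
            return True
        except requests.RequestException:
            time.sleep(delay)
    return False


if wait_for_vllm():
    print('Server 1 is up and running.')
else:
    print('Failed to start Server 1.')
```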
