
[NeuralChat] Fix PC codegen streaming issue and set 'Intel/neural-chat-7b-v3-1' as default model (#920)
lvliang-intel authored Dec 14, 2023
1 parent 3562889 commit 0f0bf22
Showing 18 changed files with 40 additions and 51 deletions.
4 changes: 2 additions & 2 deletions intel_extension_for_transformers/neural_chat/README.md
@@ -108,7 +108,7 @@ NeuralChat supports fine-tuning the pretrained large language model (LLM) for te

```shell
# Command line
neuralchat finetune --base_model "meta-llama/Llama-2-7b-chat-hf" --config pipeline/finetuning/config/finetuning.yaml
neuralchat finetune --base_model "Intel/neural-chat-7b-v3-1" --config pipeline/finetuning/config/finetuning.yaml
```

```python
@@ -124,7 +124,7 @@ NeuralChat provides typical model optimization technologies, like `Automatic Mix

```shell
# Command line
neuralchat optimize --base_model "meta-llama/Llama-2-7b-chat-hf" --config pipeline/optimization/config/optimization.yaml
neuralchat optimize --base_model "Intel/neural-chat-7b-v3-1" --config pipeline/optimization/config/optimization.yaml
```

```python
2 changes: 1 addition & 1 deletion intel_extension_for_transformers/neural_chat/config.py
@@ -412,7 +412,7 @@ class LoadingModelConfig:

class PipelineConfig:
    def __init__(self,
                 model_name_or_path="meta-llama/Llama-2-7b-chat-hf",
                 model_name_or_path="Intel/neural-chat-7b-v3-1",
                 tokenizer_name_or_path=None,
                 hf_access_token=None,
                 device="auto",
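With this change, an empty `PipelineConfig()` now resolves to "Intel/neural-chat-7b-v3-1". A minimal usage sketch, assuming the package's top-level `build_chatbot` helper alongside the `PipelineConfig` shown above (the prompt string is illustrative):

```python
# Sketch only: assumes the top-level NeuralChat Python API.
from intel_extension_for_transformers.neural_chat import PipelineConfig, build_chatbot

# With no model_name_or_path given, the config now falls back to "Intel/neural-chat-7b-v3-1".
config = PipelineConfig(device="auto")
chatbot = build_chatbot(config)
response = chatbot.predict("Tell me about Intel Xeon Scalable Processors.")  # illustrative prompt
print(response)
```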
@@ -45,8 +45,8 @@ pip install -r ../../../requirements.txt
## Install Models
```shell
git-lfs install
# download llama-2 model for NER plugin
git clone https://huggingface.co/meta-llama/Llama-2-7b-chat-hf
# download neural-chat-7b-v3-1 model for NER plugin
git clone https://huggingface.co/Intel/neural-chat-7b-v3-1
# download spacy model for NER post process
python -m spacy download en_core_web_lg
```
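After the downloads above, a quick sanity check that the spaCy model resolves correctly (standard spaCy API; the sample sentence is illustrative):

```python
import spacy

# Loads the model installed by `python -m spacy download en_core_web_lg`.
nlp = spacy.load("en_core_web_lg")
doc = nlp("Ada Lovelace visited Intel in Santa Clara last March.")
print([(ent.text, ent.label_) for ent in doc.ents])  # expect PERSON, ORG, GPE, DATE entities
```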
@@ -83,7 +83,7 @@ You can customize the configuration file `photoai.yaml` to match your environmen
| ------------------- | ---------------------------------------|
| host | 127.0.0.1 |
| port | 9000 |
| model_name_or_path | "./Llama-2-7b-chat-hf" |
| model_name_or_path | "./neural-chat-7b-v3-1" |
| device | "auto" |
| asr.enable | true |
| tts.enable | true |
@@ -46,7 +46,7 @@ pip install -r ../../../requirements.txt
## Download Models
```shell
git-lfs install
git clone https://huggingface.co/meta-llama/Llama-2-7b-chat-hf
git clone https://huggingface.co/Intel/neural-chat-7b-v3-1
```


@@ -58,7 +58,7 @@ You can customize the configuration file 'askdoc.yaml' to match your environment
| --------------------------------- | ---------------------------------------|
| host | 127.0.0.1 |
| port | 8000 |
| model_name_or_path | "./Llama-2-7b-chat-hf" |
| model_name_or_path | "./neural-chat-7b-v3-1" |
| device | "auto" |
| retrieval.enable | true |
| retrieval.args.input_path | "./docs" |
@@ -46,7 +46,7 @@ You can customize the configuration file 'textbot.yaml' to match your environmen
| --------------------- | --------------------------------------- |
| host | 127.0.0.1 |
| port | 8888 |
| model_name_or_path | "meta-llama/Llama-2-7b-chat-hf" |
| model_name_or_path | "Intel/neural-chat-7b-v3-1" |
| device | "cpu" |
| asr.enable | true |
| asr.args.device | "cpu" |
@@ -23,7 +23,7 @@
host: 0.0.0.0
port: 8888

model_name_or_path: "meta-llama/Llama-2-7b-chat-hf"
model_name_or_path: "Intel/neural-chat-7b-v3-1"
device: "cpu"

asr:
@@ -46,7 +46,7 @@ You can customize the configuration file 'textbot.yaml' to match your environmen
| ------------------- | --------------------------------------- |
| host | 127.0.0.1 |
| port | 8000 |
| model_name_or_path | "meta-llama/Llama-2-7b-chat-hf" |
| model_name_or_path | "Intel/neural-chat-7b-v3-1" |
| device | "cpu" |
| tasks_list | ['textchat'] |

@@ -23,7 +23,7 @@
host: 0.0.0.0
port: 8000

model_name_or_path: "meta-llama/Llama-2-7b-chat-hf"
model_name_or_path: "Intel/neural-chat-7b-v3-1"
device: "cpu"

# users can choose one of the ipex int8, itrex int4, mix precision and
@@ -40,13 +40,13 @@ pip install -r ../../../requirements.txt
You can customize the configuration file 'textbot.yaml' to match your environment setup. Here's a table to help you understand the configurable options:

| Item | Value |
| ------------------- | --------------------------------------- |
| host | 127.0.0.1 |
| port | 8000 |
| model_name_or_path | "meta-llama/Llama-2-7b-chat-hf" |
| device | "cpu" |
| cache.enable | true |
| tasks_list | ['textchat'] |
| ------------------- | ----------------------------------------- |
| host | 127.0.0.1 |
| port | 8000 |
| model_name_or_path | "Intel/neural-chat-7b-v3-1" |
| device | "cpu" |
| cache.enable | true |
| tasks_list | ['textchat'] |



@@ -744,7 +744,7 @@ def predict_stream(**params):
`num_beams` (int): Controls the number of beams used in beam search.
Higher values increase the diversity but also the computation time.
`model_name` (string): Specifies the name of the pre-trained model to use for text generation.
If not provided, the default model is "mosaicml/mpt-7b-chat".
If not provided, the default model is "Intel/neural-chat-7b-v3-1".
`num_return_sequences` (int): Specifies the number of alternative sequences to generate.
`bad_words_ids` (list or None): Contains a list of token IDs that should not appear in the generated text.
`force_words_ids` (list or None): Contains a list of token IDs that must be included in the generated text.
@@ -770,7 +770,7 @@ def predict_stream(**params):
    do_sample = params["do_sample"] if "do_sample" in params else True
    num_beams = int(params["num_beams"]) if "num_beams" in params else 0
    model_name = (
        params["model_name"] if "model_name" in params else "mosaicml/mpt-7b-chat"
        params["model_name"] if "model_name" in params else "Intel/neural-chat-7b-v3-1"
    )
    num_return_sequences = (
        params["num_return_sequences"] if "num_return_sequences" in params else 1
@@ -791,7 +791,9 @@ def predict_stream(**params):

    if is_llm_runtime_model(model):
        prompt = remove_prompt_history(model_name, prompt)
        max_new_tokens = max_new_tokens if max_new_tokens > 1024 else 1024
        max_new_tokens = max_new_tokens if (max_new_tokens > 1024 or \
                                            "codellama" in model_name.lower() or \
                                            "starcoder" in model_name.lower()) else 1024

    streamer = TextIteratorStreamer(
        tokenizer, skip_prompt=True, skip_special_tokens=True
@@ -1035,7 +1037,9 @@ def predict(**params):

    if is_llm_runtime_model(model):
        prompt = remove_prompt_history(model_name, prompt)
        max_new_tokens = max_new_tokens if max_new_tokens > 1024 else 1024
        max_new_tokens = max_new_tokens if (max_new_tokens > 1024 or \
                                            "codellama" in model_name.lower() or \
                                            "starcoder" in model_name.lower()) else 1024

    if num_beams == 0:
        num_beams = 1
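The two hunks above apply the same rule in `predict_stream` and `predict`: keep the caller's `max_new_tokens` for code models or for requests already above 1024 tokens, otherwise floor it at 1024. A standalone restatement of that rule (the helper name is hypothetical; the logic is copied from the diff):

```python
def effective_max_new_tokens(max_new_tokens: int, model_name: str) -> int:
    """Mirror the runtime-model branch above: code models keep their requested length."""
    name = model_name.lower()
    if max_new_tokens > 1024 or "codellama" in name or "starcoder" in name:
        return max_new_tokens
    return 1024

assert effective_max_new_tokens(256, "Intel/neural-chat-7b-v3-1") == 1024   # floored to 1024
assert effective_max_new_tokens(256, "codellama/CodeLlama-7b-hf") == 256    # kept for codegen models
```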
@@ -54,15 +54,15 @@ print("NER result: ", result)
## Plugin Parameters
You can customize the NER inference parameters to meet your needs and get better performance. You can set a specific parameter by `plugins.ner.args["xxx"]`. Below are the descriptions of each available parameter.
```python
model_name_or_path [str]: The huggingface model name or local path of the downloaded llm model. Defaults to "./Llama-2-7b-chat-hf/".
model_name_or_path [str]: The huggingface model name or local path of the downloaded llm model. Defaults to "./neural-chat-7b-v3-1/".

spacy_model [str]: The spaCy model for NLP processing; specify it according to the downloaded spaCy model. Defaults to "en_core_web_lg".

bf16 [bool]: Choose whether to use BF16 precision for NER inference. Defaults to False.
```
As for the INT8 and INT4 models, the plugin parameters are slightly different. You can set a specific parameter by `plugins.ner_int.args["xxx"]`.
```python
model_name_or_path [str]: The huggingface model name or local path of the downloaded llm model. Defaults to "./Llama-2-7b-chat-hf/".
model_name_or_path [str]: The huggingface model name or local path of the downloaded llm model. Defaults to "./neural-chat-7b-v3-1/".

spacy_model [str]: The spaCy model for NLP processing; specify it according to the downloaded spaCy model. Defaults to "en_core_web_lg".

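A short sketch of setting these parameters in code, following the `plugins.ner.args["xxx"]` pattern described above (the import path and the `enable` flag are assumptions based on the package's plugin registry):

```python
# Sketch only: assumes the shared `plugins` registry exposed by neural_chat.
from intel_extension_for_transformers.neural_chat import plugins

plugins.ner.enable = True
plugins.ner.args["model_name_or_path"] = "./neural-chat-7b-v3-1/"
plugins.ner.args["spacy_model"] = "en_core_web_lg"
plugins.ner.args["bf16"] = False
```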
@@ -44,7 +44,7 @@ class NamedEntityRecognitionINT():
"""

def __init__(self,
model_path="meta-llama/Llama-2-7b-chat-hf",
model_path="Intel/neural-chat-7b-v3-1",
spacy_model="en_core_web_lg",
compute_dtype="fp32",
weight_dtype="int8",
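An instantiation sketch using only the constructor arguments visible in this hunk (the import path is an assumption, not taken from this diff; adjust it to wherever the plugin lives in the package):

```python
# Sketch only: the module path below is assumed.
from intel_extension_for_transformers.neural_chat.pipeline.plugins.ner import NamedEntityRecognitionINT

ner = NamedEntityRecognitionINT(
    model_path="Intel/neural-chat-7b-v3-1",
    spacy_model="en_core_web_lg",
    compute_dtype="fp32",
    weight_dtype="int8",
)
```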
6 changes: 3 additions & 3 deletions intel_extension_for_transformers/neural_chat/server/README.md
@@ -16,7 +16,7 @@ NeuralChat provides a default chatbot configuration in `./config/neuralchat.yaml
| ------------------------- | ------------------------ | --------------------------------------- | --------------------------------- |
| host | | 0.0.0.0 | Any valid IP address |
| port | | 8000 | Any valid port number |
| model_name_or_path | | "meta-llama/Llama-2-7b-chat-hf" | A valid model name or path |
| model_name_or_path | | "Intel/neural-chat-7b-v3-1" | A valid model name or path |
| tokenizer_name_or_path | | "" | A tokenizer name or path |
| peft_model_path | | "" | A peft model path |
| device | | "auto" | "cpu", "hpu", "xpu", "cuda" |
@@ -47,11 +47,11 @@ NeuralChat provides a default chatbot configuration in `./config/neuralchat.yaml
| | args.embedding_model_dir | "hkunlp/instructor-large" | A valid directory path |
| safety_checker | enable | false | true, false |
| ner | enable | false | true, false |
| | args.model_path | "meta-llama/Llama-2-7b-chat-hf" | A valid directory path of llm model |
| | args.model_path | "Intel/neural-chat-7b-v3-1" | A valid directory path of llm model |
| | args.spacy_model | "en_core_web_lg" | A valid name of downloaded spacy model |
| | args.bf16 | false | true, false |
| ner_int | enable | false | true, false |
| | args.model_path | "meta-llama/Llama-2-7b-chat-hf" | A valid directory path of llm model |
| | args.model_path | "Intel/neural-chat-7b-v3-1" | A valid directory path of llm model |
| | args.spacy_model | "en_core_web_lg" | A valid name of downloaded spacy model |
| | args.compute_dtype | "fp32" | "fp32", "int8" |
| | args.weight_dtype | "int8" | "int8", "int4" |
@@ -23,7 +23,7 @@
host: 0.0.0.0
port: 8000

model_name_or_path: "meta-llama/Llama-2-7b-chat-hf"
model_name_or_path: "Intel/neural-chat-7b-v3-1"
# tokenizer_name_or_path: ""
# peft_model_path: ""
device: "auto"
@@ -80,15 +80,15 @@ ner:
  enable: false
  args:
    device: "cpu"
    model_path: "meta-llama/Llama-2-7b-chat-hf"
    model_path: "Intel/neural-chat-7b-v3-1"
    spacy_model: "en_core_web_lg"
    bf16: False

ner_int:
  enable: false
  args:
    device: "cpu"
    model_path: "meta-llama/Llama-2-7b-chat-hf"
    model_path: "Intel/neural-chat-7b-v3-1"
    spacy_model: "en_core_web_lg"
    compute_dtype: "fp32"
    weight_dtype: "int8"
@@ -100,22 +100,7 @@ def handle_chat_completion_request(self, request: ChatCompletionRequest):
        def stream_generator():
            nonlocal buffered_texts
            for output in generator:
                if isinstance(output, str):
                    chunks = output.split()
                    for chunk in chunks:
                        ret = {
                            "text": chunk,
                            "error_code": 0,
                        }
                        buffered_texts += chunk + ' '
                        yield json.dumps(ret).encode() + b"\0"
                else:
                    ret = {
                        "text": output,
                        "error_code": 0,
                    }
                    buffered_texts += output + ' '
                    yield json.dumps(ret).encode() + b"\0"
                yield f"data: {output}\n\n"
            yield f"data: [DONE]\n\n"
        if is_plugin_enabled("cache") and \
                not plugins["cache"]["instance"].pre_llm_inference_actions(request.prompt):
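On the client side, the new output can be consumed as a plain server-sent-event style stream of `data: <token>` lines terminated by `data: [DONE]`. A minimal sketch with `requests` (the endpoint path and request payload are illustrative, not taken from this diff):

```python
import requests

# Hypothetical endpoint and payload; only the "data: ..." framing matches the code above.
with requests.post(
    "http://127.0.0.1:8000/v1/chat/completions",
    json={"prompt": "def fibonacci(n):"},
    stream=True,
) as resp:
    for line in resp.iter_lines(decode_unicode=True):
        if not line or not line.startswith("data: "):
            continue  # skip blank separator lines
        token = line[len("data: "):]
        if token == "[DONE]":
            break
        print(token, end="", flush=True)
```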
@@ -39,7 +39,7 @@ class RetrievalRequest(RequestBaseModel):


class FinetuneRequest(RequestBaseModel):
    model_name_or_path: str = "meta-llama/Llama-2-7b-chat-hf"
    model_name_or_path: str = "Intel/neural-chat-7b-v3-1"
    train_file: str = None
    dataset_name: str = None
    output_dir: str = './tmp'
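The request model is a plain data schema; below is a self-contained pydantic stand-in for the fields visible in this hunk (illustrative only, not the project's actual class):

```python
from typing import Optional
from pydantic import BaseModel

class FinetuneRequestSketch(BaseModel):
    """Illustrative stand-in for FinetuneRequest with the fields shown above."""
    model_name_or_path: str = "Intel/neural-chat-7b-v3-1"
    train_file: Optional[str] = None
    dataset_name: Optional[str] = None
    output_dir: str = "./tmp"

req = FinetuneRequestSketch(train_file="./alpaca_data.json")  # illustrative training file
print(req.model_name_or_path)  # -> Intel/neural-chat-7b-v3-1
```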
@@ -61,7 +61,7 @@ This will run the chatbot application in the background on your server.
Once the application is running, you can find the access URL in the trace log:

```log
INFO | gradio_web_server | Models: meta-llama/Llama-2-7b-chat-hf
INFO | gradio_web_server | Models: Intel/neural-chat-7b-v3-1
INFO | stdout | Running on local URL: http://0.0.0.0:7860
```
The URL to access the chatbot frontend is http://{SERVER_IP_ADDRESS}:80. Please remember to replace {SERVER_IP_ADDRESS} with your server's actual IP address.
@@ -61,7 +61,7 @@ This will run the chatbot application in the background on your server.
Once the application is running, you can find the access URL in the trace log:

```log
INFO | gradio_web_server | Models: meta-llama/Llama-2-7b-chat-hf
INFO | gradio_web_server | Models: Intel/neural-chat-7b-v3-1
INFO | stdout | Running on local URL: http://0.0.0.0:7860
```
Since there are two services, start two backends and generate URLs for both backends.
