diff --git a/intel_extension_for_transformers/neural_chat/README.md b/intel_extension_for_transformers/neural_chat/README.md
index 1f461b7f78c..4f72104b464 100644
--- a/intel_extension_for_transformers/neural_chat/README.md
+++ b/intel_extension_for_transformers/neural_chat/README.md
@@ -108,7 +108,7 @@ NeuralChat supports fine-tuning the pretrained large language model (LLM) for te
 
 ```shell
 # Command line
-neuralchat finetune --base_model "meta-llama/Llama-2-7b-chat-hf" --config pipeline/finetuning/config/finetuning.yaml
+neuralchat finetune --base_model "Intel/neural-chat-7b-v3-1" --config pipeline/finetuning/config/finetuning.yaml
 ```
 
 ```python
@@ -124,7 +124,7 @@ NeuralChat provides typical model optimization technologies, like `Automatic Mix
 
 ```shell
 # Command line
-neuralchat optimize --base_model "meta-llama/Llama-2-7b-chat-hf" --config pipeline/optimization/config/optimization.yaml
+neuralchat optimize --base_model "Intel/neural-chat-7b-v3-1" --config pipeline/optimization/config/optimization.yaml
 ```
 
 ```python
diff --git a/intel_extension_for_transformers/neural_chat/config.py b/intel_extension_for_transformers/neural_chat/config.py
index 38badf25238..d0a7c359322 100644
--- a/intel_extension_for_transformers/neural_chat/config.py
+++ b/intel_extension_for_transformers/neural_chat/config.py
@@ -412,7 +412,7 @@ class LoadingModelConfig:
 
 class PipelineConfig:
     def __init__(self,
-                 model_name_or_path="meta-llama/Llama-2-7b-chat-hf",
+                 model_name_or_path="Intel/neural-chat-7b-v3-1",
                  tokenizer_name_or_path=None,
                  hf_access_token=None,
                  device="auto",
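For reference, a minimal sketch of how this new default is picked up through the Python API. It assumes the `PipelineConfig`/`build_chatbot` entry points documented in the NeuralChat README; nothing in this snippet is part of the patch itself.

```python
from intel_extension_for_transformers.neural_chat import PipelineConfig, build_chatbot

# With the patched default, an empty PipelineConfig() now resolves to
# "Intel/neural-chat-7b-v3-1"; passing model_name_or_path explicitly still overrides it.
config = PipelineConfig()  # same as PipelineConfig(model_name_or_path="Intel/neural-chat-7b-v3-1")
chatbot = build_chatbot(config)
print(chatbot.predict("Tell me about Intel Xeon Scalable Processors."))
```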
diff --git a/intel_extension_for_transformers/neural_chat/examples/deployment/photo_ai/backend/README.md b/intel_extension_for_transformers/neural_chat/examples/deployment/photo_ai/backend/README.md
index 81fedb3e404..24a3286b36b 100644
--- a/intel_extension_for_transformers/neural_chat/examples/deployment/photo_ai/backend/README.md
+++ b/intel_extension_for_transformers/neural_chat/examples/deployment/photo_ai/backend/README.md
@@ -45,8 +45,8 @@ pip install -r ../../../requirements.txt
 ## Install Models
 ```shell
 git-lfs install
-# download llama-2 model for NER plugin
-git clone https://huggingface.co/meta-llama/Llama-2-7b-chat-hf
+# download neural-chat-7b-v3-1 model for NER plugin
+git clone https://huggingface.co/Intel/neural-chat-7b-v3-1
 # download spacy model for NER post process
 python -m spacy download en_core_web_lg
 ```
@@ -83,7 +83,7 @@ You can customize the configuration file `photoai.yaml` to match your environmen
 | ------------------- | ---------------------------------------|
 | host | 127.0.0.1 |
 | port | 9000 |
-| model_name_or_path | "./Llama-2-7b-chat-hf" |
+| model_name_or_path | "./neural-chat-7b-v3-1" |
 | device | "auto" |
 | asr.enable | true |
 | tts.enable | true |
diff --git a/intel_extension_for_transformers/neural_chat/examples/deployment/rag/README.md b/intel_extension_for_transformers/neural_chat/examples/deployment/rag/README.md
index f1d6ef9eb05..da4202e157f 100644
--- a/intel_extension_for_transformers/neural_chat/examples/deployment/rag/README.md
+++ b/intel_extension_for_transformers/neural_chat/examples/deployment/rag/README.md
@@ -46,7 +46,7 @@ pip install -r ../../../requirements.txt
 ## Download Models
 ```shell
 git-lfs install
-git clone https://huggingface.co/meta-llama/Llama-2-7b-chat-hf
+git clone https://huggingface.co/Intel/neural-chat-7b-v3-1
 ```
 
@@ -58,7 +58,7 @@ You can customize the configuration file 'askdoc.yaml' to match your environment
 | --------------------------------- | ---------------------------------------|
 | host | 127.0.0.1 |
 | port | 8000 |
-| model_name_or_path | "./Llama-2-7b-chat-hf" |
+| model_name_or_path | "./neural-chat-7b-v3-1" |
 | device | "auto" |
 | retrieval.enable | true |
 | retrieval.args.input_path | "./docs" |
diff --git a/intel_extension_for_transformers/neural_chat/examples/deployment/talkingbot/server/backend/README.md b/intel_extension_for_transformers/neural_chat/examples/deployment/talkingbot/server/backend/README.md
index 3315ba8a6e2..f9525044023 100644
--- a/intel_extension_for_transformers/neural_chat/examples/deployment/talkingbot/server/backend/README.md
+++ b/intel_extension_for_transformers/neural_chat/examples/deployment/talkingbot/server/backend/README.md
@@ -46,7 +46,7 @@ You can customize the configuration file 'textbot.yaml' to match your environmen
 | --------------------- | --------------------------------------- |
 | host | 127.0.0.1 |
 | port | 8888 |
-| model_name_or_path | "meta-llama/Llama-2-7b-chat-hf" |
+| model_name_or_path | "Intel/neural-chat-7b-v3-1" |
 | device | "cpu" |
 | asr.enable | true |
 | asr.args.device | "cpu" |
diff --git a/intel_extension_for_transformers/neural_chat/examples/deployment/talkingbot/server/backend/talkingbot.yaml b/intel_extension_for_transformers/neural_chat/examples/deployment/talkingbot/server/backend/talkingbot.yaml
index f4fa601703e..2d1249d86f6 100644
--- a/intel_extension_for_transformers/neural_chat/examples/deployment/talkingbot/server/backend/talkingbot.yaml
+++ b/intel_extension_for_transformers/neural_chat/examples/deployment/talkingbot/server/backend/talkingbot.yaml
@@ -23,7 +23,7 @@
 host: 0.0.0.0
 port: 8888
 
-model_name_or_path: "meta-llama/Llama-2-7b-chat-hf"
+model_name_or_path: "Intel/neural-chat-7b-v3-1"
 device: "cpu"
 
 asr:
diff --git a/intel_extension_for_transformers/neural_chat/examples/deployment/textbot/backend/xeon/README.md b/intel_extension_for_transformers/neural_chat/examples/deployment/textbot/backend/xeon/README.md
index 6f51fd7ece2..05d082775cc 100644
--- a/intel_extension_for_transformers/neural_chat/examples/deployment/textbot/backend/xeon/README.md
+++ b/intel_extension_for_transformers/neural_chat/examples/deployment/textbot/backend/xeon/README.md
@@ -46,7 +46,7 @@ You can customize the configuration file 'textbot.yaml' to match your environmen
 | ------------------- | --------------------------------------- |
 | host | 127.0.0.1 |
 | port | 8000 |
-| model_name_or_path | "meta-llama/Llama-2-7b-chat-hf" |
+| model_name_or_path | "Intel/neural-chat-7b-v3-1" |
 | device | "cpu" |
 | tasks_list | ['textchat'] |
 
diff --git a/intel_extension_for_transformers/neural_chat/examples/deployment/textbot/backend/xeon/textbot.yaml b/intel_extension_for_transformers/neural_chat/examples/deployment/textbot/backend/xeon/textbot.yaml
index d37b5b3515e..9a912bc8fe0 100644
--- a/intel_extension_for_transformers/neural_chat/examples/deployment/textbot/backend/xeon/textbot.yaml
+++ b/intel_extension_for_transformers/neural_chat/examples/deployment/textbot/backend/xeon/textbot.yaml
@@ -23,7 +23,7 @@
 host: 0.0.0.0
 port: 8000
 
-model_name_or_path: "meta-llama/Llama-2-7b-chat-hf"
+model_name_or_path: "Intel/neural-chat-7b-v3-1"
 device: "cpu"
 
 # users can choose one of the ipex int8, itrex int4, mix precision and
diff --git a/intel_extension_for_transformers/neural_chat/examples/deployment/textbot/backend_with_cache/README.md b/intel_extension_for_transformers/neural_chat/examples/deployment/textbot/backend_with_cache/README.md
index 6ba231f44be..26554bee030 100644
--- a/intel_extension_for_transformers/neural_chat/examples/deployment/textbot/backend_with_cache/README.md
+++ b/intel_extension_for_transformers/neural_chat/examples/deployment/textbot/backend_with_cache/README.md
@@ -40,13 +40,13 @@ pip install -r ../../../requirements.txt
 You can customize the configuration file 'textbot.yaml' to match your environment setup. Here's a table to help you understand the configurable options:
 
 | Item | Value |
-| ------------------- | --------------------------------------- |
-| host | 127.0.0.1 |
-| port | 8000 |
-| model_name_or_path | "meta-llama/Llama-2-7b-chat-hf" |
-| device | "cpu" |
-| cache.enable | true |
-| tasks_list | ['textchat'] |
+| ------------------- | ----------------------------------------- |
+| host | 127.0.0.1 |
+| port | 8000 |
+| model_name_or_path | "Intel/neural-chat-7b-v3-1" |
+| device | "cpu" |
+| cache.enable | true |
+| tasks_list | ['textchat'] |
 
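The deployment READMEs above now reference the model either by Hub id (`Intel/neural-chat-7b-v3-1`) or by a local clone (`./neural-chat-7b-v3-1`). A small, hypothetical helper for rewriting one of these YAML files to point at the local clone; the file name and keys mirror the tables above, while the script itself is illustrative only and not part of the patch.

```python
import yaml

# Point an existing textbot.yaml at the local checkout produced by
# `git clone https://huggingface.co/Intel/neural-chat-7b-v3-1`.
with open("textbot.yaml") as f:
    cfg = yaml.safe_load(f)

cfg["model_name_or_path"] = "./neural-chat-7b-v3-1"  # local path instead of Hub id
cfg["device"] = "cpu"

with open("textbot.yaml", "w") as f:
    yaml.safe_dump(cfg, f, sort_keys=False)
```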
diff --git a/intel_extension_for_transformers/neural_chat/models/model_utils.py b/intel_extension_for_transformers/neural_chat/models/model_utils.py
index b5433c79898..ce9a68173b7 100644
--- a/intel_extension_for_transformers/neural_chat/models/model_utils.py
+++ b/intel_extension_for_transformers/neural_chat/models/model_utils.py
@@ -742,7 +742,7 @@ def predict_stream(**params):
             `num_beams` (int): Controls the number of beams used in beam search.
                 Higher values increase the diversity but also the computation time.
             `model_name` (string): Specifies the name of the pre-trained model to use for text generation.
-                If not provided, the default model is "mosaicml/mpt-7b-chat".
+                If not provided, the default model is "Intel/neural-chat-7b-v3-1".
             `num_return_sequences` (int): Specifies the number of alternative sequences to generate.
             `bad_words_ids` (list or None): Contains a list of token IDs that should not appear in the generated text.
             `force_words_ids` (list or None): Contains a list of token IDs that must be included in the generated text.
@@ -768,7 +768,7 @@ def predict_stream(**params):
     do_sample = params["do_sample"] if "do_sample" in params else True
     num_beams = int(params["num_beams"]) if "num_beams" in params else 0
     model_name = (
-        params["model_name"] if "model_name" in params else "mosaicml/mpt-7b-chat"
+        params["model_name"] if "model_name" in params else "Intel/neural-chat-7b-v3-1"
     )
     num_return_sequences = (
         params["num_return_sequences"] if "num_return_sequences" in params else 1
@@ -789,7 +789,9 @@ def predict_stream(**params):
 
     if is_llm_runtime_model(model):
         prompt = remove_prompt_history(model_name, prompt)
-        max_new_tokens = max_new_tokens if max_new_tokens > 1024 else 1024
+        max_new_tokens = max_new_tokens if (max_new_tokens > 1024 or \
+            "codellama" in model_name.lower() or \
+            "starcoder" in model_name.lower()) else 1024
 
     streamer = TextIteratorStreamer(
         tokenizer, skip_prompt=True, skip_special_tokens=True
@@ -1033,7 +1035,9 @@ def predict(**params):
 
     if is_llm_runtime_model(model):
         prompt = remove_prompt_history(model_name, prompt)
-        max_new_tokens = max_new_tokens if max_new_tokens > 1024 else 1024
+        max_new_tokens = max_new_tokens if (max_new_tokens > 1024 or \
+            "codellama" in model_name.lower() or \
+            "starcoder" in model_name.lower()) else 1024
 
     if num_beams == 0:
         num_beams = 1
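The two `max_new_tokens` hunks relax the 1024-token floor so that code models keep the caller's smaller budget. The same rule expressed as a standalone function (the helper name is hypothetical, the logic mirrors the patch):

```python
def effective_max_new_tokens(requested: int, model_name: str) -> int:
    """Mirror of the patched floor logic for LLM-runtime models.

    Requests above 1024 pass through unchanged; smaller requests are bumped
    up to 1024 unless the model is a code model (CodeLlama/StarCoder), which
    keeps the caller's value so short completions stay short.
    """
    name = model_name.lower()
    if requested > 1024 or "codellama" in name or "starcoder" in name:
        return requested
    return 1024


assert effective_max_new_tokens(256, "Intel/neural-chat-7b-v3-1") == 1024
assert effective_max_new_tokens(256, "codellama/CodeLlama-7b-hf") == 256
assert effective_max_new_tokens(2048, "Intel/neural-chat-7b-v3-1") == 2048
```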
diff --git a/intel_extension_for_transformers/neural_chat/pipeline/plugins/ner/README.md b/intel_extension_for_transformers/neural_chat/pipeline/plugins/ner/README.md
index 0e56e22730c..f709cd7bb1d 100644
--- a/intel_extension_for_transformers/neural_chat/pipeline/plugins/ner/README.md
+++ b/intel_extension_for_transformers/neural_chat/pipeline/plugins/ner/README.md
@@ -54,7 +54,7 @@ print("NER result: ", result)
 ## Plugin Parameters
 You can costomize the NER inference parameters to meet the personal demands for better performance. You can set the specific parameter by `plugins.ner.args["xxx"]`. Below are the descriptions of each available parameters.
 ```python
-model_name_or_path [str]: The huggingface model name or local path of the downloaded llm model. Default to "./Llama-2-7b-chat-hf/".
+model_name_or_path [str]: The huggingface model name or local path of the downloaded llm model. Default to "./neural-chat-7b-v3-1/".
 
 spacy_model [str]: The Spacy model for NLP process, specify it according to the downloaded Spacy model. Default to "en_core_web_lg".
 
@@ -62,7 +62,7 @@ bf16 [bool]: Choose wether to use BF16 precision for NER inference. Default to F
 ```
 As for INT8 and INT4 model the plugin parameters are slightly different. You can set the specific parameter by `plugins.ner_int.args["xxx"]`.
 ```python
-model_name_or_path [str]: The huggingface model name or local path of the downloaded llm model. Default to "./Llama-2-7b-chat-hf/".
+model_name_or_path [str]: The huggingface model name or local path of the downloaded llm model. Default to "./neural-chat-7b-v3-1/".
 
 spacy_model [str]: The Spacy model for NLP process, specify it according to the downloaded Spacy model. Default to "en_core_web_lg".
 
diff --git a/intel_extension_for_transformers/neural_chat/pipeline/plugins/ner/ner_int.py b/intel_extension_for_transformers/neural_chat/pipeline/plugins/ner/ner_int.py
index da628dc7245..926a960b7d0 100644
--- a/intel_extension_for_transformers/neural_chat/pipeline/plugins/ner/ner_int.py
+++ b/intel_extension_for_transformers/neural_chat/pipeline/plugins/ner/ner_int.py
@@ -44,7 +44,7 @@ class NamedEntityRecognitionINT():
     """
 
     def __init__(self,
-                 model_path="meta-llama/Llama-2-7b-chat-hf",
+                 model_path="Intel/neural-chat-7b-v3-1",
                  spacy_model="en_core_web_lg",
                  compute_dtype="fp32",
                  weight_dtype="int8",
diff --git a/intel_extension_for_transformers/neural_chat/server/README.md b/intel_extension_for_transformers/neural_chat/server/README.md
index f6099d5d5f1..9f723cf7931 100644
--- a/intel_extension_for_transformers/neural_chat/server/README.md
+++ b/intel_extension_for_transformers/neural_chat/server/README.md
@@ -16,7 +16,7 @@ NeuralChat provides a default chatbot configuration in `./config/neuralchat.yaml
 | ------------------------- | ------------------------ | --------------------------------------- | --------------------------------- |
 | host | | 0.0.0.0 | Any valid IP address |
 | port | | 8000 | Any valid port number |
-| model_name_or_path | | "meta-llama/Llama-2-7b-chat-hf" | A valid model name or path |
+| model_name_or_path | | "Intel/neural-chat-7b-v3-1" | A valid model name or path |
 | tokenizer_name_or_path | | "" | A tokenizer name or path |
 | peft_model_path | | "" | A peft model path |
 | device | | "auto" | "cpu", "hpu", "xpu", "cuda" |
@@ -47,11 +47,11 @@ NeuralChat provides a default chatbot configuration in `./config/neuralchat.yaml
 | | args.embedding_model_dir | "hkunlp/instructor-large" | A valid directory path |
 | safety_checker | enable | false | true, false |
 | ner | enable | false | true, false |
-| | args.model_path | "meta-llama/Llama-2-7b-chat-hf" | A valid directory path of llm model |
+| | args.model_path | "Intel/neural-chat-7b-v3-1" | A valid directory path of llm model |
 | | args.spacy_model | "en_core_web_lg" | A valid name of downloaded spacy model |
 | | args.bf16 | false | true, false |
 | ner_int | enable | false | true, false |
-| | args.model_path | "meta-llama/Llama-2-7b-chat-hf" | A valid directory path of llm model |
+| | args.model_path | "Intel/neural-chat-7b-v3-1" | A valid directory path of llm model |
 | | args.spacy_model | "en_core_web_lg" | A valid name of downloaded spacy model |
 | | args.compute_dtype | "fp32" | "fp32", "int8" |
 | | args.weight_dtype | "int8" | "int8", "int4" |
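The NER README above documents `plugins.ner.args["xxx"]` as the way to override these defaults. A short sketch of enabling the plugin against the locally downloaded model; it assumes the `plugins` registry and `PipelineConfig(plugins=...)` wiring shown elsewhere in the NeuralChat docs, and is illustrative rather than part of this patch.

```python
from intel_extension_for_transformers.neural_chat import PipelineConfig, build_chatbot, plugins

# Enable NER and point it at the local clone used by the photo_ai example;
# the two args below come straight from the plugin parameter list above.
plugins.ner.enable = True
plugins.ner.args["model_name_or_path"] = "./neural-chat-7b-v3-1"
plugins.ner.args["spacy_model"] = "en_core_web_lg"

chatbot = build_chatbot(PipelineConfig(plugins=plugins))
print(chatbot.predict("Book a flight from Shanghai to Beijing next Monday."))
```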
diff --git a/intel_extension_for_transformers/neural_chat/server/config/neuralchat.yaml b/intel_extension_for_transformers/neural_chat/server/config/neuralchat.yaml
index 1d757b60f97..f87f9262d11 100644
--- a/intel_extension_for_transformers/neural_chat/server/config/neuralchat.yaml
+++ b/intel_extension_for_transformers/neural_chat/server/config/neuralchat.yaml
@@ -23,7 +23,7 @@
 host: 0.0.0.0
 port: 8000
 
-model_name_or_path: "meta-llama/Llama-2-7b-chat-hf"
+model_name_or_path: "Intel/neural-chat-7b-v3-1"
 # tokenizer_name_or_path: ""
 # peft_model_path: ""
 device: "auto"
@@ -80,7 +80,7 @@ ner:
     enable: false
     args:
         device: "cpu"
-        model_path: "meta-llama/Llama-2-7b-chat-hf"
+        model_path: "Intel/neural-chat-7b-v3-1"
         spacy_model: "en_core_web_lg"
         bf16: False
 
@@ -88,7 +88,7 @@ ner_int:
     enable: false
     args:
         device: "cpu"
-        model_path: "meta-llama/Llama-2-7b-chat-hf"
+        model_path: "Intel/neural-chat-7b-v3-1"
         spacy_model: "en_core_web_lg"
         compute_dtype: "fp32"
         weight_dtype: "int8"
diff --git a/intel_extension_for_transformers/neural_chat/server/restful/codegen_api.py b/intel_extension_for_transformers/neural_chat/server/restful/codegen_api.py
index bf257385c7a..22be5bc444c 100644
--- a/intel_extension_for_transformers/neural_chat/server/restful/codegen_api.py
+++ b/intel_extension_for_transformers/neural_chat/server/restful/codegen_api.py
@@ -100,22 +100,7 @@ def handle_chat_completion_request(self, request: ChatCompletionRequest):
         def stream_generator():
             nonlocal buffered_texts
             for output in generator:
-                if isinstance(output, str):
-                    chunks = output.split()
-                    for chunk in chunks:
-                        ret = {
-                            "text": chunk,
-                            "error_code": 0,
-                        }
-                        buffered_texts += chunk + ' '
-                        yield json.dumps(ret).encode() + b"\0"
-                else:
-                    ret = {
-                        "text": output,
-                        "error_code": 0,
-                    }
-                    buffered_texts += output + ' '
-                    yield json.dumps(ret).encode() + b"\0"
+                yield f"data: {output}\n\n"
             yield f"data: [DONE]\n\n"
         if is_plugin_enabled("cache") and \
             not plugins["cache"]["instance"].pre_llm_inference_actions(request.prompt):
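The rewritten `stream_generator` switches from NUL-delimited JSON chunks to server-sent-event style lines (`data: ...`, terminated by `data: [DONE]`). A minimal client sketch for consuming the new format; the endpoint path and request payload are assumptions for illustration only, not part of this patch.

```python
import requests

# Hypothetical route/payload -- substitute the actual codegen endpoint and schema.
resp = requests.post(
    "http://127.0.0.1:8000/v1/code_generation",
    json={"prompt": "def quicksort(arr):"},
    stream=True,
)
for line in resp.iter_lines(decode_unicode=True):
    if not line:  # SSE events are separated by blank lines
        continue
    data = line.removeprefix("data: ")
    if data == "[DONE]":
        break
    print(data, end="", flush=True)
```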
"int8" diff --git a/intel_extension_for_transformers/neural_chat/server/restful/codegen_api.py b/intel_extension_for_transformers/neural_chat/server/restful/codegen_api.py index bf257385c7a..22be5bc444c 100644 --- a/intel_extension_for_transformers/neural_chat/server/restful/codegen_api.py +++ b/intel_extension_for_transformers/neural_chat/server/restful/codegen_api.py @@ -100,22 +100,7 @@ def handle_chat_completion_request(self, request: ChatCompletionRequest): def stream_generator(): nonlocal buffered_texts for output in generator: - if isinstance(output, str): - chunks = output.split() - for chunk in chunks: - ret = { - "text": chunk, - "error_code": 0, - } - buffered_texts += chunk + ' ' - yield json.dumps(ret).encode() + b"\0" - else: - ret = { - "text": output, - "error_code": 0, - } - buffered_texts += output + ' ' - yield json.dumps(ret).encode() + b"\0" + yield f"data: {output}\n\n" yield f"data: [DONE]\n\n" if is_plugin_enabled("cache") and \ not plugins["cache"]["instance"].pre_llm_inference_actions(request.prompt): diff --git a/intel_extension_for_transformers/neural_chat/server/restful/request.py b/intel_extension_for_transformers/neural_chat/server/restful/request.py index 7f1b42eb91d..288d5ca62d4 100644 --- a/intel_extension_for_transformers/neural_chat/server/restful/request.py +++ b/intel_extension_for_transformers/neural_chat/server/restful/request.py @@ -39,7 +39,7 @@ class RetrievalRequest(RequestBaseModel): class FinetuneRequest(RequestBaseModel): - model_name_or_path: str = "meta-llama/Llama-2-7b-chat-hf" + model_name_or_path: str = "Intel/neural-chat-7b-v3-1" train_file: str = None dataset_name: str = None output_dir: str = './tmp' diff --git a/intel_extension_for_transformers/neural_chat/ui/gradio/basic/README.md b/intel_extension_for_transformers/neural_chat/ui/gradio/basic/README.md index a0b2ebf6cac..88f32d01b25 100644 --- a/intel_extension_for_transformers/neural_chat/ui/gradio/basic/README.md +++ b/intel_extension_for_transformers/neural_chat/ui/gradio/basic/README.md @@ -61,7 +61,7 @@ This will run the chatbot application in the background on your server. Once the application is running, you can find the access URL in the trace log: ```log -INFO | gradio_web_server | Models: meta-llama/Llama-2-7b-chat-hf +INFO | gradio_web_server | Models: Intel/neural-chat-7b-v3-1 INFO | stdout | Running on local URL: http://0.0.0.0:7860 ``` The URL to access the chatbot frontend is http://{SERVER_IP_ADDRESS}:80. Please remember to replace {SERVER_IP_ADDRESS} with your server's actual IP address. diff --git a/intel_extension_for_transformers/neural_chat/ui/gradio/side_by_side/README.md b/intel_extension_for_transformers/neural_chat/ui/gradio/side_by_side/README.md index ddcc389ff44..0467c820740 100644 --- a/intel_extension_for_transformers/neural_chat/ui/gradio/side_by_side/README.md +++ b/intel_extension_for_transformers/neural_chat/ui/gradio/side_by_side/README.md @@ -61,7 +61,7 @@ This will run the chatbot application in the background on your server. Once the application is running, you can find the access URL in the trace log: ```log -INFO | gradio_web_server | Models: meta-llama/Llama-2-7b-chat-hf +INFO | gradio_web_server | Models: Intel/neural-chat-7b-v3-1 INFO | stdout | Running on local URL: http://0.0.0.0:7860 ``` since there are two services, start two backends and generate URLs for both backends.