From ad4becdc982793275b2f3ef9bd68a89a34c9b2e4 Mon Sep 17 00:00:00 2001
From: jiafu zhang
Date: Wed, 23 Aug 2023 09:35:51 +0800
Subject: [PATCH] CI add inference test for mosaicml-mpt-7b-chat (#157)

Signed-off-by: jiafu zhang
---
 .../chatbot-inference-llama-2-7b-chat-hf.yml  | 12 +++---
 .../chatbot-inference-mpt-7b-chat.yml         | 41 +++++++++++++++++++
 .github/workflows/chatbot-test.yml            |  5 +++
 workflows/chatbot/inference/generate.py       | 10 ++---
 4 files changed, 58 insertions(+), 10 deletions(-)
 create mode 100644 .github/workflows/chatbot-inference-mpt-7b-chat.yml

diff --git a/.github/workflows/chatbot-inference-llama-2-7b-chat-hf.yml b/.github/workflows/chatbot-inference-llama-2-7b-chat-hf.yml
index c74d28c7741..ae514f36c77 100644
--- a/.github/workflows/chatbot-inference-llama-2-7b-chat-hf.yml
+++ b/.github/workflows/chatbot-inference-llama-2-7b-chat-hf.yml
@@ -3,9 +3,8 @@ name: Chatbot inference on llama-2-7b-chat-hf
 on:
   workflow_call:
 
-# If there is a new commit, the previous jobs will be canceled
 concurrency:
-  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-inf-lla-7b
   cancel-in-progress: true
 
 jobs:
@@ -16,18 +15,21 @@ jobs:
     - name: Checkout
       uses: actions/checkout@v2
 
+    - name: Load environment variables
+      run: cat ~/itrex-actions-runner/.env >> $GITHUB_ENV
+
     - name: Build Docker Image
-      run: docker build ./ --target cpu --build-arg http_proxy="$HTTP_PROXY_IMAGE_BUILD" --build-arg https_proxy="$HTTPS_PROXY_IMAGE_BUILD" -f workflows/chatbot/inference/docker/Dockerfile -t chatbotinfer:latest && yes | docker container prune && yes | docker image prune
+      run: docker build ./ --target cpu --build-arg http_proxy="${{ env.HTTP_PROXY_IMAGE_BUILD }}" --build-arg https_proxy="${{ env.HTTPS_PROXY_IMAGE_BUILD }}" -f workflows/chatbot/inference/docker/Dockerfile -t chatbotinfer:latest && yes | docker container prune && yes | docker image prune
 
     - name: Start Docker Container
       run: |
         cid=$(docker ps -q --filter "name=chatbotinfer")
         if [[ ! -z "$cid" ]]; then docker stop $cid && docker rm $cid; fi
-        docker run -tid -v /mnt/DP_disk1/huggingface/cache/:/root/.cache/huggingface/hub -v .:/root/chatbot -e http_proxy="$HTTP_PROXY_CONTAINER_RUN" -e https_proxy="$HTTPS_PROXY_CONTAINER_RUN" --name="chatbotinfer" --hostname="chatbotinfer-container" chatbotinfer:latest
+        docker run -tid -v /mnt/DP_disk1/huggingface/cache/:/root/.cache/huggingface/hub -v .:/root/chatbot -e http_proxy="${{ env.HTTP_PROXY_CONTAINER_RUN }}" -e https_proxy="${{ env.HTTPS_PROXY_CONTAINER_RUN }}" --name="chatbotinfer" --hostname="chatbotinfer-container" chatbotinfer:latest
 
     - name: Run Inference Test
       run: |
-        docker exec "chatbotinfer" bash -c "cd /root/chatbot && source activate && conda activate chatbot-demo; python workflows/chatbot/inference/generate.py --base_model_path \"meta-llama/Llama-2-7b-chat-hf\" --hf_access_token \"$HF_ACCESS_TOKEN\" --instructions \"Transform the following sentence into one that shows contrast. The tree is rotten.\" "
+        docker exec "chatbotinfer" bash -c "cd /root/chatbot && source activate && conda activate chatbot-demo; python workflows/chatbot/inference/generate.py --base_model_path \"meta-llama/Llama-2-7b-chat-hf\" --hf_access_token \"${{ env.HF_ACCESS_TOKEN }}\" --instructions \"Transform the following sentence into one that shows contrast. The tree is rotten.\" "
 
     - name: Stop Container
       if: success() || failure()
diff --git a/.github/workflows/chatbot-inference-mpt-7b-chat.yml b/.github/workflows/chatbot-inference-mpt-7b-chat.yml
new file mode 100644
index 00000000000..aa9ff338243
--- /dev/null
+++ b/.github/workflows/chatbot-inference-mpt-7b-chat.yml
@@ -0,0 +1,41 @@
+name: Chatbot inference on mosaicml/mpt-7b-chat
+
+on:
+  workflow_call:
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-inf-mpt-7b
+  cancel-in-progress: true
+
+jobs:
+  inference:
+    name: inference test
+    runs-on: lms-lab
+    steps:
+    - name: Checkout
+      uses: actions/checkout@v2
+
+    - name: Load environment variables
+      run: cat ~/itrex-actions-runner/.env >> $GITHUB_ENV
+
+    - name: Build Docker Image
+      run: docker build ./ --target cpu --build-arg http_proxy="${{ env.HTTP_PROXY_IMAGE_BUILD }}" --build-arg https_proxy="${{ env.HTTPS_PROXY_IMAGE_BUILD }}" -f workflows/chatbot/inference/docker/Dockerfile -t chatbotinfer:latest && yes | docker container prune && yes | docker image prune
+
+    - name: Start Docker Container
+      run: |
+        cid=$(docker ps -q --filter "name=chatbotinfer")
+        if [[ ! -z "$cid" ]]; then docker stop $cid && docker rm $cid; fi
+        docker run -tid -v /mnt/DP_disk1/huggingface/cache/:/root/.cache/huggingface/hub -v .:/root/chatbot -e http_proxy="${{ env.HTTP_PROXY_CONTAINER_RUN }}" -e https_proxy="${{ env.HTTPS_PROXY_CONTAINER_RUN }}" --name="chatbotinfer" --hostname="chatbotinfer-container" chatbotinfer:latest
+
+    - name: Run Inference Test
+      run: |
+        docker exec "chatbotinfer" bash -c "cd /root/chatbot && source activate && conda activate chatbot-demo; python workflows/chatbot/inference/generate.py --base_model_path \"mosaicml/mpt-7b-chat\" --instructions \"Transform the following sentence into one that shows contrast. The tree is rotten.\" "
+
+    - name: Stop Container
+      if: success() || failure()
+      run: |
+        cid=$(docker ps -q --filter "name=chatbotinfer")
-z "$cid" ]]; then docker stop $cid && docker rm $cid; fi + + - name: Test Summary + run: echo "Inference completed successfully" diff --git a/.github/workflows/chatbot-test.yml b/.github/workflows/chatbot-test.yml index 15f2e57ab6c..afc365dfe72 100644 --- a/.github/workflows/chatbot-test.yml +++ b/.github/workflows/chatbot-test.yml @@ -8,6 +8,7 @@ on: - './requirements.txt' - '.github/workflows/chatbot-test.yml' - '.github/workflows/chatbot-inference-llama-2-7b-chat-hf.yml' + - '.github/workflows/chatbot-inference-mpt-7b-chat.yml' - 'intel_extension_for_transformers/**' - 'workflows/chatbot/inference/**' - 'workflows/dlsa/**' @@ -25,3 +26,7 @@ jobs: call-inference-llama-2-7b-chat-hf: uses: ./.github/workflows/chatbot-inference-llama-2-7b-chat-hf.yml + call-inference-mpt-7b-chat: + uses: ./.github/workflows/chatbot-inference-mpt-7b-chat.yml + + diff --git a/workflows/chatbot/inference/generate.py b/workflows/chatbot/inference/generate.py index f340446e5b2..7af13146933 100644 --- a/workflows/chatbot/inference/generate.py +++ b/workflows/chatbot/inference/generate.py @@ -370,12 +370,12 @@ def load_model( tokenizer_name, use_fast=False if (re.search("llama", model_name, re.IGNORECASE) or re.search("neural-chat-7b-v2", model_name, re.IGNORECASE)) else True, - token=hf_access_token, + use_auth_token=hf_access_token, ) if re.search("flan-t5", model_name, re.IGNORECASE): with smart_context_manager(use_deepspeed=use_deepspeed): model = AutoModelForSeq2SeqLM.from_pretrained( - model_name, low_cpu_mem_usage=True, token=hf_access_token + model_name, low_cpu_mem_usage=True, use_auth_token=hf_access_token ) elif (re.search("mpt", model_name, re.IGNORECASE) or re.search("neural-chat-7b-v1", model_name, re.IGNORECASE)): @@ -388,7 +388,7 @@ def load_model( torch_dtype=torch.bfloat16, low_cpu_mem_usage=True, torchscript=cpu_jit, - token=hf_access_token, + use_auth_token=hf_access_token, ) elif ( re.search("gpt", model_name, re.IGNORECASE) @@ -399,7 +399,7 @@ def load_model( ): with smart_context_manager(use_deepspeed=use_deepspeed): model = AutoModelForCausalLM.from_pretrained( - model_name, torch_dtype=torch.bfloat16, low_cpu_mem_usage=True, token=hf_access_token + model_name, torch_dtype=torch.bfloat16, low_cpu_mem_usage=True, use_auth_token=hf_access_token ) else: raise ValueError( @@ -477,7 +477,7 @@ def load_model( from models.mpt.mpt_trace import jit_trace_mpt_7b, MPTTSModelForCausalLM model = jit_trace_mpt_7b(model) - config = AutoConfig.from_pretrained(model_name, trust_remote_code=True, token=hf_access_token) + config = AutoConfig.from_pretrained(model_name, trust_remote_code=True, use_auth_token=hf_access_token) model = MPTTSModelForCausalLM( model, config, use_cache=use_cache, model_dtype=torch.bfloat16 )