[Example] BAAI/bge-large-en-v1.5 & BAAI/bge-base-en-v1.5 examples add #619

Merged 3 commits on Nov 3, 2023
@@ -0,0 +1,107 @@
Step-by-Step
=======
This document describes the end-to-end workflow for the Hugging Face model [BERT Base Uncased](https://huggingface.co/textattack/bert-base-uncased-MRPC) with the Neural Engine backend.

# Prerequisite
## Prepare Python Environment
Create a Python environment, optionally with `autoconf` for jemalloc support.
```shell
conda create -n <env name> python=3.8 [autoconf]
conda activate <env name>
```

Check that the `gcc` version is higher than 9.0.
```shell
gcc -v
```

Install Intel® Extension for Transformers; see [installation](/docs/installation.md) for details.
```shell
# Install from pypi
pip install intel-extension-for-transformers

# Or, install from source code
cd <intel_extension_for_transformers_folder>
pip install -r requirements.txt
pip install -v .
```

Install the required dependencies for this example:
```shell
cd <intel_extension_for_transformers_folder>/examples/huggingface/pytorch/text-classification/deployment/mrpc/bert_base
pip install -r requirements.txt
```
>**Note**: We recommend installing protobuf <= 3.20.0 if you use onnxruntime <= 1.11.

## Environment Variables (Optional)
```shell
# Preloading libjemalloc.so may improve performance when running inference with multiple instances.
conda install jemalloc==5.2.1 -c conda-forge -y
export LD_PRELOAD=${LD_PRELOAD}:${CONDA_PREFIX}/lib/libjemalloc.so

# Weight sharing can save memory and may improve performance when running multiple instances.
export WEIGHT_SHARING=1
export INST_NUM=<inst num>
```
>**Note**: This step is optional.

# Inference Pipeline

Neural Engine can parse both ONNX models and Neural Engine IR.
We provide three `mode`s: `accuracy`, `throughput`, and `latency`. In throughput mode, multiple instances are launched with 4 cores per instance, occupying one socket.
You can run fp32 model inference by setting `precision=fp32`:
```shell
bash run_bert_base.sh --model=textattack/bert-base-uncased-MRPC --dataset=mrpc --precision=fp32 --mode=throughput
```
Set `precision=int8` to get a post-training quantized (PTQ) int8 model, or `precision=bf16` to get a bf16 model.
```shell
bash run_bert_base.sh --model=textattack/bert-base-uncased-MRPC --dataset=mrpc --precision=int8 --mode=throughput
```
Set `precision=dynamic_int8` to benchmark a dynamically quantized int8 model.
```shell
bash run_bert_base.sh --model=textattack/bert-base-uncased-MRPC --dataset=mrpc --precision=dynamic_int8 --mode=throughput
```


You can also compile the model to Neural Engine IR with the Python API:
```python
from intel_extension_for_transformers.llm.runtime.deprecated.compile import compile
graph = compile('./model_and_tokenizer/int8-model.onnx')
graph.save('./ir')
```
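
As a quick sanity check, you can feed the compiled graph NumPy arrays directly. The sketch below is illustrative: the three int32 inputs mirror what `executor_dataloader.py` produces (input ids, token type ids, attention mask), and the name and shape of the output tensor depend on the model.
```python
import numpy as np
from intel_extension_for_transformers.llm.runtime.deprecated.compile import compile

graph = compile('./model_and_tokenizer/int8-model.onnx')

# Dummy batch: input_ids, token_type_ids, attention_mask, each of shape (batch_size, seq_len).
batch_size, seq_len = 1, 128
input_ids = np.random.randint(1, 128, size=(batch_size, seq_len)).astype('int32')
token_type_ids = np.zeros((batch_size, seq_len), dtype='int32')
attention_mask = np.ones((batch_size, seq_len), dtype='int32')

# graph.inference takes a list of input arrays and returns a dict of output tensors.
outputs = graph.inference([input_ids, token_type_ids, attention_mask])
logits = list(outputs.values())[0]
print(logits.shape)  # e.g. (1, 2) for the two MRPC classes
```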

# Benchmark
If you want to run inference on a local ONNX model, we provide both a Python API and a C++ API. To use the C++ API, you need to convert the model to Neural Engine IR first.

By setting `--dynamic_quanzite` for the FP32 model, you can benchmark a dynamically quantized int8 model.
## Accuracy
Run the Python API command as follows:
```shell
GLOG_minloglevel=2 python run_executor.py --input_model=./model_and_tokenizer/int8-model.onnx --tokenizer_dir=./model_and_tokenizer --mode=accuracy --dataset_name=glue --task_name=mrpc --batch_size=8
```

If you just want a quick start, you can try a small subset of the dataset, like this:
```shell
GLOG_minloglevel=2 python run_executor.py --input_model=./model_and_tokenizer/int8-model.onnx --tokenizer_dir=./model_and_tokenizer --mode=accuracy --dataset_name=glue --task_name=mrpc --batch_size=8 --max_eval_samples=10
```

>**Note**: Accuracy measured on a partial dataset is not representative.

## Performance
Run the Python API command as follows:
```shell
GLOG_minloglevel=2 python run_executor.py --input_model=./model_and_tokenizer/int8-model.onnx --mode=performance --dataset_name=glue --task_name=mrpc --batch_size=1 --seq_len=128
```

You can use the C++ API as well. First, compile the model to Neural Engine IR; then run the `neural_engine` binary.

> **Note**: The warmup count below is recommended to be 1/10 of the iteration count and no less than 3.
```shell
export GLOG_minloglevel=2
export OMP_NUM_THREADS=<cpu_cores>
export DNNL_MAX_CPU_ISA=AVX512_CORE_AMX
export UNIFIED_BUFFER=1
numactl -C 0-<cpu_cores-1> neural_engine \
--batch_size=<batch_size> --iterations=<iterations> --w=<warmup> \
--seq_len=128 --config=./ir/conf.yaml --weight=./ir/model.bin
```
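
For instance, a minimal sketch of choosing the warmup count according to the rule above (the iteration count here is only an example):
```python
# Warmup: 1/10 of the iterations, but never fewer than 3.
iterations = 100
warm_up = max(3, iterations // 10)
print(warm_up)  # 10
```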
@@ -0,0 +1,53 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (c) 2021 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import math
import numpy as np
from transformers import AutoTokenizer
from datasets import load_dataset

class DataLoader(object):
    """Tokenizes sentence-pair validation data and yields batches of int32 arrays for Neural Engine."""

    def __init__(self, batch_size, seq_len, dataset_name, task_name, data_dir, tokenizer_dir):
        self.batch_size = batch_size
        dataset = load_dataset(dataset_name, task_name, cache_dir=data_dir, split='validation')
        tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir)
        # Tokenize sentence pairs, padding/truncating every example to seq_len tokens.
        self.dataset = dataset.map(lambda e: tokenizer(e['sentence1'], e['sentence2'],
                                   truncation=True, padding='max_length', max_length=seq_len), batched=True)

    def __getitem__(self, idx):
        start = idx * self.batch_size
        end = start + self.batch_size
        # The last batch may contain fewer than batch_size samples.
        if end > len(self.dataset):
            input_ids_data = self.dataset[start:]['input_ids']
            segment_ids_data = self.dataset[start:]['token_type_ids']
            input_mask_data = self.dataset[start:]['attention_mask']
            label_data = self.dataset[start:]['label']
        else:
            input_ids_data = self.dataset[start:end]['input_ids']
            segment_ids_data = self.dataset[start:end]['token_type_ids']
            input_mask_data = self.dataset[start:end]['attention_mask']
            label_data = self.dataset[start:end]['label']

        sample_size = len(input_ids_data) if isinstance(input_ids_data, list) else 1

        # Return ([input_ids, token_type_ids, attention_mask], labels) as int32 arrays.
        return [np.array(input_ids_data).reshape(sample_size, -1).astype('int32'),
                np.array(segment_ids_data).reshape(sample_size, -1).astype('int32'),
                np.array(input_mask_data).reshape(sample_size, -1).astype('int32')], \
               np.array(label_data).reshape(sample_size, -1).astype('int32')

    def __len__(self):
        return math.ceil(len(self.dataset) / self.batch_size)
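
A minimal usage sketch of this dataloader (the paths and parameters below are illustrative and assume the exported `model_and_tokenizer` directory from the steps above):
```python
from executor_dataloader import DataLoader

# GLUE/MRPC validation split, batches of 8 samples padded to 128 tokens.
dataloader = DataLoader(batch_size=8, seq_len=128, dataset_name='glue', task_name='mrpc',
                        data_dir='./data', tokenizer_dir='./model_and_tokenizer')

inputs, labels = dataloader[0]            # first batch
print(len(dataloader))                    # number of batches
print([x.shape for x in inputs])          # [(8, 128), (8, 128), (8, 128)]
print(labels.shape)                       # (8, 1)
```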
@@ -0,0 +1,75 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (c) 2021 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import time
import numpy as np
from tqdm import tqdm
from datasets import load_metric
from executor_dataloader import DataLoader
import sys
import os

common_dir = os.path.join(sys.path[0], "../../../../neural_engine_utils/")
sys.path.append(common_dir)
from common import (log, DummyDataLoader, compute_performance, Neural_Engine_base)


class Neural_Engine(Neural_Engine_base):

    def accuracy(self, batch_size, seq_len, dataset_name, task_name, data_dir, tokenizer_dir):
        # load dataset
        log.info("Load dataset ......")
        dataset = DataLoader(batch_size, seq_len, dataset_name, task_name, data_dir, tokenizer_dir)
        # load metric
        log.info("Load metric ......")
        if dataset_name and task_name is not None:
            metric = load_metric(dataset_name, task_name)
        else:
            metric = load_metric("accuracy")
        # execute
        log.info("Start engine ......")
        for idx in tqdm(range(len(dataset))):
            inputs = dataset[idx][0]
            labels = dataset[idx][1]
            predictions = self.graph.inference(inputs)
            predictions = list(predictions.values())[0]
            predictions = np.argmax(predictions, axis=1)
            metric.add_batch(
                predictions=predictions,
                references=labels,
            )
        # compute metrics
        log.info("Compute metrics ......")
        eval_metric = metric.compute()
        accuracy_value = eval_metric.get("accuracy")
        f1_value = eval_metric.get("f1")
        log.info(f"Accuracy: {accuracy_value}")
        log.info(f"F1: {f1_value}")

    def performance(self, batch_size, seq_len, iteration, warm_up):
        if warm_up >= iteration:
            log.error("Warm up should be less than iteration.")
            raise ValueError()
        # generate dummy dataset
        log.info("Generate dummy dataset ......")
        shape = [batch_size, seq_len]
        dataset = DummyDataLoader(shapes=[shape, shape, shape],
                                  lows=[1, 1, 1],
                                  highs=[128, 1, 1],
                                  dtypes=['int32', 'int32', 'int32'],
                                  iteration=iteration)
        compute_performance(dataset, self.graph, log, self.log_file, warm_up, batch_size, seq_len)
@@ -0,0 +1,10 @@
neural-compressor
transformers
accelerate
datasets >= 1.8.0
sentencepiece != 0.1.92
protobuf
torch==2.1.0
onnx>=1.12
onnxruntime==1.13.1
