From c399e38c8c6045b719cd249a347262cb0ea226b3 Mon Sep 17 00:00:00 2001 From: Zhenzhong1 <109137058+Zhenzhong1@users.noreply.github.com> Date: Fri, 3 Nov 2023 20:41:12 +0800 Subject: [PATCH] [Example] BAAI/bge-large-en-v1.5 & BAAI/bge-base-en-v1.5 examples add (#619) * initialize the bge-large * initialize the bge_large * the bge_base inferneces pass --- .../deployment/mrpc/bge_large/README.md | 107 +++ .../mrpc/bge_large/executor_dataloader.py | 53 ++ .../mrpc/bge_large/executor_utils.py | 75 +++ .../mrpc/bge_large/requirements.txt | 10 + .../mrpc/bge_large/run_bert_base.sh | 224 +++++++ .../deployment/mrpc/bge_large/run_executor.py | 70 ++ .../deployment/mrpc/bge_large/run_glue.py | 623 ++++++++++++++++++ 7 files changed, 1162 insertions(+) create mode 100644 examples/huggingface/pytorch/text-classification/deployment/mrpc/bge_large/README.md create mode 100644 examples/huggingface/pytorch/text-classification/deployment/mrpc/bge_large/executor_dataloader.py create mode 100644 examples/huggingface/pytorch/text-classification/deployment/mrpc/bge_large/executor_utils.py create mode 100644 examples/huggingface/pytorch/text-classification/deployment/mrpc/bge_large/requirements.txt create mode 100644 examples/huggingface/pytorch/text-classification/deployment/mrpc/bge_large/run_bert_base.sh create mode 100644 examples/huggingface/pytorch/text-classification/deployment/mrpc/bge_large/run_executor.py create mode 100644 examples/huggingface/pytorch/text-classification/deployment/mrpc/bge_large/run_glue.py diff --git a/examples/huggingface/pytorch/text-classification/deployment/mrpc/bge_large/README.md b/examples/huggingface/pytorch/text-classification/deployment/mrpc/bge_large/README.md new file mode 100644 index 00000000000..9cb756866dc --- /dev/null +++ b/examples/huggingface/pytorch/text-classification/deployment/mrpc/bge_large/README.md @@ -0,0 +1,107 @@ +Step-by-Step +======= +This document describes the end-to-end workflow for Huggingface model [BERT Base Uncased](https://huggingface.co/textattack/bert-base-uncased-MRPC) with Neural Engine backend. + +# Prerequisite +## Prepare Python Environment +Create a python environment, optionally with autoconf for jemalloc support. +```shell +conda create -n python=3.8 [autoconf] +conda activate +``` + +Check that `gcc` version is higher than 9.0. +```shell +gcc -v +``` + +Install IntelĀ® Extension for Transformers, please refer to [installation](/docs/installation.md). +```shell +# Install from pypi +pip install intel-extension-for-transformers + +# Or, install from source code +cd +pip install -r requirements.txt +pip install -v . +``` + +Install required dependencies for this example +```shell +cd /examples/huggingface/pytorch/text-classification/deployment/mrpc/bert_base +pip install -r requirements.txt +``` +>**Note**: Recommend install protobuf <= 3.20.0 if use onnxruntime <= 1.11 + +## Environment Variables (Optional) +```shell +# Preload libjemalloc.so may improve the performance when inference under multi instance. +conda install jemalloc==5.2.1 -c conda-forge -y +export LD_PRELOAD=${LD_PRELOAD}:${CONDA_PREFIX}/lib/libjemalloc.so + +# Using weight sharing can save memory and may improve the performance when multi instances. +export WEIGHT_SHARING=1 +export INST_NUM= +``` +>**Note**: This step is optional. + +# Inference Pipeline + +Neural Engine can parse ONNX model and Neural Engine IR. +We provide with three `mode`s: `accuracy`, `throughput` or `latency`. 
+For throughput mode, we use multiple instances, with 4 cores per instance occupying one socket.
+You can run fp32 model inference by setting `precision=fp32`, command as follows:
+```shell
+bash run_bert_base.sh --model=textattack/bert-base-uncased-MRPC --dataset=mrpc --precision=fp32 --mode=throughput
+```
+By setting `precision=int8` you could get a PTQ int8 model, and by setting `precision=bf16` you could get a bf16 model.
+```shell
+bash run_bert_base.sh --model=textattack/bert-base-uncased-MRPC --dataset=mrpc --precision=int8 --mode=throughput
+```
+By setting `precision=dynamic_int8`, you could benchmark a dynamically quantized int8 model.
+```shell
+bash run_bert_base.sh --model=textattack/bert-base-uncased-MRPC --dataset=mrpc --precision=dynamic_int8 --mode=throughput
+```
+
+
+You could also compile the model to IR using the python API as follows:
+```python
+from intel_extension_for_transformers.llm.runtime.deprecated.compile import compile
+graph = compile('./model_and_tokenizer/int8-model.onnx')
+graph.save('./ir')
+```
+
+# Benchmark
+If you want to run local onnx model inference, we provide both a Python API and a C++ API. To use the C++ API, you need to convert the model to IR first.
+
+By setting `--dynamic_quantize` for the FP32 model, you could benchmark the dynamically quantized int8 model.
+## Accuracy
+Python API command as follows:
+```shell
+GLOG_minloglevel=2 python run_executor.py --input_model=./model_and_tokenizer/int8-model.onnx --tokenizer_dir=./model_and_tokenizer --mode=accuracy --dataset_name=glue --task_name=mrpc --batch_size=8
+```
+
+If you just want a quick start, you could try a small subset of the dataset, like this:
+```shell
+GLOG_minloglevel=2 python run_executor.py --input_model=./model_and_tokenizer/int8-model.onnx --tokenizer_dir=./model_and_tokenizer --mode=accuracy --dataset_name=glue --task_name=mrpc --batch_size=8 --max_eval_samples=10
+```
+
+>**Note**: Accuracy measured on a partial dataset is not representative of the full dataset.
+
+## Performance
+Python API command as follows:
+```shell
+GLOG_minloglevel=2 python run_executor.py --input_model=./model_and_tokenizer/int8-model.onnx --mode=performance --dataset_name=glue --task_name=mrpc --batch_size=1 --seq_len=128
+```
+
+You could use the C++ API as well. First, compile the model to IR; then run the `neural_engine` binary.
+
+> **Note**: The warmup below is recommended to be 1/10 of iterations and no less than 3.
+```shell
+export GLOG_minloglevel=2
+export OMP_NUM_THREADS=
+export DNNL_MAX_CPU_ISA=AVX512_CORE_AMX
+export UNIFIED_BUFFER=1
+numactl -C 0- neural_engine \
+  --batch_size= --iterations= --w= \
+  --seq_len=128 --config=./ir/conf.yaml --weight=./ir/model.bin
+```
diff --git a/examples/huggingface/pytorch/text-classification/deployment/mrpc/bge_large/executor_dataloader.py b/examples/huggingface/pytorch/text-classification/deployment/mrpc/bge_large/executor_dataloader.py
new file mode 100644
index 00000000000..8df7c6908de
--- /dev/null
+++ b/examples/huggingface/pytorch/text-classification/deployment/mrpc/bge_large/executor_dataloader.py
@@ -0,0 +1,53 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+# Copyright (c) 2021 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +import numpy as np +from transformers import AutoTokenizer +from datasets import load_dataset + +class DataLoader(object): + def __init__(self, batch_size, seq_len, dataset_name, task_name, data_dir, tokenizer_dir): + self.batch_size = batch_size + dataset = load_dataset(dataset_name, task_name, cache_dir=data_dir, split='validation') + tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir) + self.dataset = dataset.map(lambda e: tokenizer(e['sentence1'], e['sentence2'], + truncation=True, padding='max_length', max_length=seq_len), batched=True) + + def __getitem__(self, idx): + start = idx * self.batch_size + end = start + self.batch_size + if end > len(self.dataset): + input_ids_data = self.dataset[start:]['input_ids'] + segment_ids_data = self.dataset[start:]['token_type_ids'] + input_mask_data = self.dataset[start:]['attention_mask'] + label_data = self.dataset[start:]['label'] + else: + input_ids_data = self.dataset[start:end]['input_ids'] + segment_ids_data = self.dataset[start:end]['token_type_ids'] + input_mask_data = self.dataset[start:end]['attention_mask'] + label_data = self.dataset[start:end]['label'] + + sample_size = len(input_ids_data) if isinstance(input_ids_data, list) else 1 + + return [np.array(input_ids_data).reshape(sample_size, -1).astype('int32'), + np.array(segment_ids_data).reshape(sample_size, -1).astype('int32'), + np.array(input_mask_data).reshape(sample_size, -1).astype('int32')], \ + np.array(label_data).reshape(sample_size, -1).astype('int32') + + def __len__(self): + return math.ceil(len(self.dataset)/self.batch_size) diff --git a/examples/huggingface/pytorch/text-classification/deployment/mrpc/bge_large/executor_utils.py b/examples/huggingface/pytorch/text-classification/deployment/mrpc/bge_large/executor_utils.py new file mode 100644 index 00000000000..9fc90d0e49a --- /dev/null +++ b/examples/huggingface/pytorch/text-classification/deployment/mrpc/bge_large/executor_utils.py @@ -0,0 +1,75 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Copyright (c) 2021 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
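+
+# Illustrative note: the DataLoader defined in executor_dataloader.py batches the
+# GLUE/MRPC validation split and yields ([input_ids, token_type_ids, attention_mask], labels);
+# the three inputs are int32 arrays of shape (batch, seq_len) and labels has shape
+# (batch, 1), with a smaller final batch. A minimal consumption sketch (paths are placeholders):
+#
+#   from executor_dataloader import DataLoader
+#   loader = DataLoader(batch_size=8, seq_len=128, dataset_name="glue",
+#                       task_name="mrpc", data_dir="./data",
+#                       tokenizer_dir="./model_and_tokenizer")
+#   inputs, labels = loader[0]  # first batch: [ids, segments, mask], labels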
+ +import time +import numpy as np +from tqdm import tqdm +from datasets import load_metric +from executor_dataloader import DataLoader +import sys +import os + +common_dir = os.path.join(sys.path[0], "../../../../neural_engine_utils/") +sys.path.append(common_dir) +from common import (log, DummyDataLoader, compute_performance, Neural_Engine_base) + + +class Neural_Engine(Neural_Engine_base): + + def accuracy(self, batch_size, seq_len, dataset_name, task_name, data_dir, tokenizer_dir): + # load dataset + log.info("Load dataset ......") + dataset = DataLoader(batch_size, seq_len, dataset_name, task_name, data_dir, tokenizer_dir) + # load metric + log.info("Load metric ......") + if dataset_name and task_name is not None: + metric = load_metric(dataset_name, task_name) + else: + metric = load_metric("accuracy") + # execute + log.info("Start engine ......") + for idx in tqdm(range(len(dataset))): + inputs = dataset[idx][0] + labels = dataset[idx][1] + predictions = self.graph.inference(inputs) + predictions = list(predictions.values())[0] + predictions = np.argmax(predictions, axis=1) + metric.add_batch( + predictions=predictions, + references=labels, + ) + # compute metrics + log.info("Compute metrics ......") + eval_metric = metric.compute() + accuracy_value = eval_metric.get("accuracy") + f1_value = eval_metric.get("f1") + log.info(f"Accuracy: {accuracy_value}") + log.info(f"F1: {f1_value}") + + def performance(self, batch_size, seq_len, iteration, warm_up): + if warm_up >= iteration: + log.error("Warm up should less than iteration.") + raise ValueError() + # generate dummy dataset + log.info("Generate dummy dataset ......") + shape = [batch_size, seq_len] + dataset = DummyDataLoader(shapes=[shape, shape, shape], + lows=[1, 1, 1], + highs=[128, 1, 1], + dtypes=['int32', 'int32', 'int32'], + iteration=iteration) + compute_performance(dataset, self.graph, log, self.log_file, warm_up, batch_size, seq_len) diff --git a/examples/huggingface/pytorch/text-classification/deployment/mrpc/bge_large/requirements.txt b/examples/huggingface/pytorch/text-classification/deployment/mrpc/bge_large/requirements.txt new file mode 100644 index 00000000000..7661f639bc8 --- /dev/null +++ b/examples/huggingface/pytorch/text-classification/deployment/mrpc/bge_large/requirements.txt @@ -0,0 +1,10 @@ +neural-compressor +transformers +accelerate +datasets >= 1.8.0 +sentencepiece != 0.1.92 +protobuf +torch==2.1.0 +onnx>=1.12 +onnxruntime==1.13.1 + diff --git a/examples/huggingface/pytorch/text-classification/deployment/mrpc/bge_large/run_bert_base.sh b/examples/huggingface/pytorch/text-classification/deployment/mrpc/bge_large/run_bert_base.sh new file mode 100644 index 00000000000..f2e133f1d9e --- /dev/null +++ b/examples/huggingface/pytorch/text-classification/deployment/mrpc/bge_large/run_bert_base.sh @@ -0,0 +1,224 @@ +#!/bin/bash +# set -x +#!/bin/bash + +# Copyright (C) 2022 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions +# and limitations under the License. 
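+
+# Illustrative usage (values are examples; every default below can be overridden
+# with --key=value):
+#
+#   bash run_bert_base.sh --model=textattack/bert-base-uncased-MRPC \
+#        --dataset=mrpc --precision=int8 --mode=throughput
+#
+# mode=accuracy and mode=latency run a single run_executor.py instance, while
+# mode=throughput launches one run_executor.py instance per 4 cores on socket 0.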
+# + +OUTPUT_DIR=$(pwd) +LOG_NAME="bert_base_mrpc.log" +DATASET="mrpc" +MODEL_NAME_OR_PATH="textattack/bert-base-uncased-MRPC" +BATCH_SIZE=8 +WARM_UP=100 +SEQUENCE_LEN=128 +ITERATION=1000 +PRECISION="int8" +CACHE_DIR="./tmp" +MODE="performance" + +for var in "$@" +do + case $1 in + --log_name=*) + LOG_NAME=$(echo $var |cut -f2 -d=) + echo "log name prefix is $LOG_NAME" + ;; + --output=*) + OUTPUT_DIR=$(echo $var |cut -f2 -d=) + echo "output location is $OUTPUT_DIR" + ;; + --dataset=*) + DATASET=$(echo $var |cut -f2 -d=) + echo "dataset is : $DATASET" + ;; + --cache_dir=*) + CACHE_DIR=$(echo $var |cut -f2 -d=) + echo "cache location is : $CACHE_DIR" + ;; + --sequence_len=*) + SEQUENCE_LEN=$(echo $var |cut -f2 -d=) + echo "sequence_len is : $SEQUENCE_LEN" + ;; + --model=*) + MODEL_NAME_OR_PATH=$(echo $var |cut -f2 -d=) + echo "The MODEL_NAME_OR_PATH is : $MODEL_NAME_OR_PATH" + ;; + --precision=*) + PRECISION=$(echo $var |cut -f2 -d=) + echo "The PRECISION is : $PRECISION" + ;; + --batch_size=*) + BATCH_SIZE=$(echo $var |cut -f2 -d=) + echo "batch size for inference is: $BATCH_SIZE" + ;; + --warm_up=*) + WARM_UP=$(echo $var |cut -f2 -d=) + echo "warm up for inference is: $WARM_UP" + ;; + --iteration=*) + ITERATION=$(echo $var |cut -f2 -d=) + echo "iteration for inference is: $ITERATION" + ;; + --mode=*) + MODE=$(echo $var |cut -f2 -d=) + echo "inference mode is: $MODE" + ;; + -h | --help ) + echo "Usage: bash $0 [OPTIONS]" + echo "OPTION includes:" + echo " --LOG_NAME - the log name prefix" + echo " --OUTPUT_DIR - the output location" + echo " --dataset - dataset for model optimization" + echo " --sequence_len - max sequence length" + echo " --cache_dir - use cache to speed up" + echo " --model - the input model name or path" + echo " --precision - model precision for inference" + echo " --mode - inference mode, choose from accuracy or throughput or latency" + echo " --batch_size - batch size for inference" + echo " --iteraion - iteration for inference" + echo " --warm_up - warm up steps for inference" + echo " --help - displays this message" + exit + ;; + * ) + echo "Invalid option: $1" + echo "Usage: bash $0 [OPTIONS]" + echo "OPTION includes:" + echo " --OUTPUT_DIR - the output location" + echo " --dataset - dataset for model optimization" + echo " --sequence_len - max sequence length" + echo " --cache_dir - use cache to speed up" + echo " --model - the input model name or path" + echo " --precision - model precision for inference" + echo " --mode - inference mode, choose from accuracy or throughput or latency" + echo " --batch_size - batch size for inference" + echo " --iteraion - iteration for inference" + echo " --warm_up - warm up steps for inference" + exit + ;; + esac + shift +done +## +inference_model="./model_and_tokenizer/${PRECISION}-model.onnx" +if [[ ${PRECISION} = 'dynamic_int8' ]]; then + inference_model="./model_and_tokenizer/fp32-model.onnx" +fi +if [[ -f ${inference_model} ]]; then + echo "===== Load ONNX Model ${MODEL_NAME_OR_PATH} from local ======" +else + echo "========== Prepare Model ${MODEL_NAME_OR_PATH} with Precision ${PRECISION} ========" + mode_cmd="" + if [[ ${PRECISION} = 'int8' ]]; then + mode_cmd=$mode_cmd" --tune --quantization_approach PostTrainingStatic" + elif [[ ${PRECISION} = 'bf16' ]]; then + mode_cmd=$mode_cmd" --enable_bf16" + fi + echo "tmp output, will remove later"${mode_cmd} + + python run_glue.py \ + --model_name_or_path ${MODEL_NAME_OR_PATH} \ + --task_name ${DATASET} \ + --do_train \ + --do_eval \ + --cache_dir ${CACHE_DIR} \ + --output_dir 
"model_and_tokenizer" \ + --overwrite_output_dir \ + --to_onnx \ + ${mode_cmd} 2>&1 | tee "$OUTPUT_DIR/$LOG_NAME-tune.log" +fi + +echo "===== Model ${MODEL_NAME_OR_PATH} Inference with Mode ${MODE} ====== " +mode_cmd="" +if [[ ${PRECISION} = 'dynamic_int8' ]]; then + mode_cmd=$mode_cmd" --dynamic_quantize True" + inference_model="./model_and_tokenizer/fp32-model.onnx" +fi +if [[ ${MODE} == "accuracy" ]]; then + echo "------------ACCURACY BENCHMARK---------" + python run_executor.py \ + --input_model=${inference_model} \ + --mode=$MODE \ + --batch_size=${BATCH_SIZE} \ + --seq_len=${SEQUENCE_LEN} \ + --warm_up=${WARM_UP} \ + --iteration=${ITERATION} \ + --dataset_name=glue \ + --task_name=${DATASET} \ + --tokenizer_dir=./model_and_tokenizer \ + ${mode_cmd} 2>&1 | tee "$OUTPUT_DIR/$LOG_NAME-${MODE}-pipeline.log" + status=$? + if [ ${status} != 0 ]; then + echo "Benchmark process returned non-zero exit code." + exit 1 + fi +elif [[ ${MODE} == "latency" ]]; then + echo "------------LATENCY BENCHMARK---------" + python run_executor.py \ + --input_model=${inference_model} \ + --mode="performance" \ + --batch_size=${BATCH_SIZE} \ + --seq_len=${SEQUENCE_LEN} \ + --warm_up=${WARM_UP} \ + --iteration=${ITERATION} \ + --dataset_name=glue \ + --task_name=${DATASET} \ + --tokenizer_dir=./model_and_tokenizer \ + ${mode_cmd} 2>&1 | tee "$OUTPUT_DIR/$LOG_NAME-latency-pipeline.log" + status=$? + if [ ${status} != 0 ]; then + echo "Benchmark process returned non-zero exit code." + exit 1 + fi +elif [[ ${MODE} == "throughput" ]]; then + echo "------------MULTI-INSTANCE BENCHMARK---------" + benchmark_cmd="python run_executor.py --input_model=${inference_model} --mode=performance --batch_size=${BATCH_SIZE} --seq_len=${SEQUENCE_LEN} --warm_up=${WARM_UP} --iteration=${ITERATION} --task_name=${DATASET} --dataset_name=glue --tokenizer_dir=./model_and_tokenizer ${mode_cmd} " + ncores_per_socket=${ncores_per_socket:=$( lscpu | grep 'Core(s) per socket' | cut -d: -f2 | xargs echo -n)} + benchmark_pids=() + ncores_per_instance=4 + export OMP_NUM_THREADS=${ncores_per_instance} + logFile="$OUTPUT_DIR/$LOG_NAME-throughput" + echo "Executing multi instance benchmark" + for((j=0;$j<${ncores_per_socket};j=$(($j + ${ncores_per_instance})))); + do + end_core_num=$((j + ncores_per_instance -1)) + if [ ${end_core_num} -ge ${ncores_per_socket} ]; then + end_core_num=$((ncores_per_socket-1)) + fi + numactl -m 0 -C "$j-$end_core_num" \ + ${benchmark_cmd} 2>&1 | tee ${logFile}-${ncores_per_socket}-${ncores_per_instance}-${j}.log & + benchmark_pids+=($!) + done + + status="SUCCESS" + for pid in "${benchmark_pids[@]}"; do + wait $pid + exit_code=$? + echo "Detected exit code: ${exit_code}" + if [ ${exit_code} == 0 ]; then + echo "Process ${pid} succeeded" + else + echo "Process ${pid} failed" + status="FAILURE" + fi + done + echo "Benchmark process status: ${status}" + if [ ${status} == "FAILURE" ]; then + echo "Benchmark process returned non-zero exit code." 
+ exit 1 + fi +fi + diff --git a/examples/huggingface/pytorch/text-classification/deployment/mrpc/bge_large/run_executor.py b/examples/huggingface/pytorch/text-classification/deployment/mrpc/bge_large/run_executor.py new file mode 100644 index 00000000000..f8cc1499e00 --- /dev/null +++ b/examples/huggingface/pytorch/text-classification/deployment/mrpc/bge_large/run_executor.py @@ -0,0 +1,70 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Copyright (c) 2021 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +from executor_utils import log, Neural_Engine + + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--input_model", + default="./model_and_tokenizer/int8-model.onnx", + type=str, + help="Input model path.") + parser.add_argument("--mode", + default="accuracy", + type=str, + help="Benchmark mode of performance or accuracy.") + parser.add_argument("--batch_size", default=8, type=int, help="Batch size.") + parser.add_argument("--seq_len", default=128, type=int, help="Sequence length.") + parser.add_argument("--warm_up", + default=5, + type=int, + help="Warm up iteration in performance mode.") + parser.add_argument("--iteration", default=10, type=int, help="Iteration in performance mode.") + parser.add_argument("--tokenizer_dir", + default="textattack/bert-base-uncased-MRPC", + type=str, + help="Pre-trained model tokenizer name or path") + parser.add_argument("--data_dir", default="./data", type=str, help="Data cache directory.") + parser.add_argument("--dataset_name", default="glue", type=str, help="Name of dataset.") + parser.add_argument("--task_name", default="mrpc", type=str, help="Task name of dataset.") + parser.add_argument("--log_file", + default="executor.log", + type=str, + help="File path to log information.") + parser.add_argument("--dynamic_quantize", + default=False, + type=bool, + help="dynamic quantize for fp32 model.") + args = parser.parse_args() + return args + + +if __name__ == '__main__': + args = get_args() + if args.dynamic_quantize: + executor = Neural_Engine(args.input_model, args.log_file, "dynamic_int8") + else: + executor = Neural_Engine(args.input_model, args.log_file, "native") + if args.mode == "accuracy": + executor.accuracy(args.batch_size, args.seq_len, args.dataset_name, args.task_name, + args.data_dir, args.tokenizer_dir) + elif args.mode == "performance": + executor.performance(args.batch_size, args.seq_len, args.iteration, args.warm_up) + else: + log.error("Benchmark only has performance or accuracy mode") diff --git a/examples/huggingface/pytorch/text-classification/deployment/mrpc/bge_large/run_glue.py b/examples/huggingface/pytorch/text-classification/deployment/mrpc/bge_large/run_glue.py new file mode 100644 index 00000000000..74ddf0fbd5c --- /dev/null +++ b/examples/huggingface/pytorch/text-classification/deployment/mrpc/bge_large/run_glue.py @@ -0,0 +1,623 @@ +#!/usr/bin/env python +# coding=utf-8 +# Copyright 2021 The HuggingFace Team. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" Finetuning the library models for sequence classification on GLUE.""" +# You can also adapt this script on your own text classification task. Pointers for this are left as comments. + +from mteb import MTEB +from C_MTEB import * +from sentence_transformers import SentenceTransformer +import datasets +import logging +import numpy as np +import os +import random +import sys +import transformers +from dataclasses import dataclass, field +from datasets import load_dataset, load_metric +from intel_extension_for_transformers.transformers import metrics, objectives, OptimizedModel, QuantizationConfig +from intel_extension_for_transformers.transformers.trainer import NLPTrainer + +from transformers import ( + AutoConfig, + AutoModelForSequenceClassification, + AutoTokenizer, + DataCollatorWithPadding, + EvalPrediction, + HfArgumentParser, + PretrainedConfig, + TrainingArguments, + default_data_collator, + set_seed, +) +from transformers.trainer_utils import get_last_checkpoint +from transformers.utils import check_min_version +from typing import Optional + + +os.environ["CUDA_VISIBLE_DEVICES"] = "" +os.environ["WANDB_DISABLED"] = "true" + + +# Will error if the minimal version of Transformers is not installed. Remove at your own risks. +check_min_version("4.12.0") + + +task_to_keys = { + "cola": ("sentence", None), + "mnli": ("premise", "hypothesis"), + "mrpc": ("sentence1", "sentence2"), + "qnli": ("question", "sentence"), + "qqp": ("question1", "question2"), + "rte": ("sentence1", "sentence2"), + "sst2": ("sentence", None), + "stsb": ("sentence1", "sentence2"), + "wnli": ("sentence1", "sentence2"), +} + +logger = logging.getLogger(__name__) + + +@dataclass +class DataTrainingArguments: + """ + Arguments pertaining to what data we are going to input our model for training and eval. + Using `HfArgumentParser` we can turn this class + into argparse arguments to be able to specify them on + the command line. + """ + + task_name: Optional[str] = field( + default=None, + metadata={"help": "The name of the task to train on: " + ", ".join(task_to_keys.keys())}, + ) + dataset_name: Optional[str] = field( + default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."} + ) + dataset_config_name: Optional[str] = field( + default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."} + ) + max_seq_length: int = field( + default=128, + metadata={ + "help": "The maximum total input sequence length after tokenization. Sequences longer " + "than this will be truncated, sequences shorter will be padded." + }, + ) + overwrite_cache: bool = field( + default=False, metadata={"help": "Overwrite the cached preprocessed datasets or not."} + ) + pad_to_max_length: bool = field( + default=True, + metadata={ + "help": "Whether to pad all samples to `max_seq_length`. " + "If False, will pad the samples dynamically when batching to the maximum length in the batch." 
+ }, + ) + max_train_samples: Optional[int] = field( + default=None, + metadata={ + "help": "For debugging purposes or quicker training, truncate the number of training examples to this " + "value if set." + }, + ) + max_eval_samples: Optional[int] = field( + default=None, + metadata={ + "help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this " + "value if set." + }, + ) + max_predict_samples: Optional[int] = field( + default=None, + metadata={ + "help": "For debugging purposes or quicker training, truncate the number of prediction examples to this " + "value if set." + }, + ) + train_file: Optional[str] = field( + default=None, metadata={"help": "A csv or a json file containing the training data."} + ) + validation_file: Optional[str] = field( + default=None, metadata={"help": "A csv or a json file containing the validation data."} + ) + + def __post_init__(self): + if self.task_name is not None: + self.task_name = self.task_name.lower() + if self.task_name not in task_to_keys.keys(): + raise ValueError("Unknown task, you should pick one in " + ",".join(task_to_keys.keys())) + elif self.dataset_name is not None: + pass + elif self.train_file is None or self.validation_file is None: + raise ValueError("Need either a GLUE task, a training/validation file or a dataset name.") + else: + train_extension = self.train_file.split(".")[-1] + assert train_extension in ["csv", "json"], "`train_file` should be a csv or a json file." + validation_extension = self.validation_file.split(".")[-1] + assert ( + validation_extension == train_extension + ), "`validation_file` should have the same extension (csv or json) as `train_file`." + + +@dataclass +class ModelArguments: + """ + Arguments pertaining to which model/config/tokenizer we are going to fine-tune from. + """ + + model_name_or_path: str = field( + metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"} + ) + config_name: Optional[str] = field( + default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"} + ) + tokenizer_name: Optional[str] = field( + default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} + ) + cache_dir: Optional[str] = field( + default=None, + metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"}, + ) + use_fast_tokenizer: bool = field( + default=True, + metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."}, + ) + model_revision: str = field( + default="main", + metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, + ) + use_auth_token: bool = field( + default=False, + metadata={ + "help": "Will use the token generated when running `transformers-cli login` (necessary to use this script " + "with private models)." + }, + ) + + +@dataclass +class OptimizationArguments: + """ + Arguments pertaining to what type of optimization we are going to apply on the model. + """ + + tune: bool = field( + default=False, + metadata={"help": "Whether or not to apply quantization."}, + ) + quantization_approach: Optional[str] = field( + default="PostTrainingStatic", + metadata={"help": "Quantization approach. 
Supported approach are PostTrainingStatic, " + "PostTrainingDynamic and QuantizationAwareTraining."}, + ) + metric_name: Optional[str] = field( + default=None, + metadata={"help": "Metric used for the tuning strategy."}, + ) + is_relative: Optional[bool] = field( + default=True, + metadata={"help": "Metric tolerance model, expected to be relative or absolute."}, + ) + perf_tol: Optional[float] = field( + default=0.01, + metadata={"help": "Performance tolerance when optimizing the model."}, + ) + benchmark: bool = field( + default=False, + metadata={"help": "run benchmark."}) + benchmark_only: bool = field( + default=False, + metadata={"help": "run benchmark only."}) + int8: bool = field( + default=False, + metadata={"help":"load int8 model."}) + enable_bf16: bool = field( + default=False, + metadata={"help":"Whether or not to apply bf16."}) + accuracy_only: bool = field( + default=False, + metadata={"help":"Whether to only test accuracy for model tuned by Neural Compressor."}) + to_onnx: bool = field( + default=False, + metadata={"help":"Transfer pytorch model to onnx model."}) + +def main(): + # See all possible arguments in src/transformers/training_args.py + # or by passing the --help flag to this script. + # We now keep distinct sets of args, for a cleaner separation of concerns. + + parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments, OptimizationArguments)) + if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): + # If we pass only one argument to the script and it's the path to a json file, + # let's parse it to get our arguments. + model_args, data_args, training_args, optim_args = parser.parse_json_file( + json_file=os.path.abspath(sys.argv[1]) + ) + else: + model_args, data_args, training_args, optim_args = parser.parse_args_into_dataclasses() + + # Setup logging + logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + handlers=[logging.StreamHandler(sys.stdout)], + ) + + log_level = training_args.get_process_log_level() + logger.setLevel(log_level) + datasets.utils.logging.set_verbosity(log_level) + transformers.utils.logging.set_verbosity(log_level) + transformers.utils.logging.enable_default_handler() + transformers.utils.logging.enable_explicit_format() + + # Log on each process the small summary: + logger.warning( + f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" + + f"\ndistributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" + ) + logger.info(f"Training/evaluation parameters {training_args}") + + # Detecting last checkpoint. + last_checkpoint = None + if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: + last_checkpoint = get_last_checkpoint(training_args.output_dir) + if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: + raise ValueError( + f"Output directory ({training_args.output_dir}) already exists and is not empty. " + "Use --overwrite_output_dir to overcome." + ) + elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: + logger.info( + f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " + "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." + ) + + # Set seed before initializing model. 
+ set_seed(training_args.seed) + + # Get the datasets: you can either provide your own CSV/JSON training and evaluation files (see below) + # or specify a GLUE benchmark task (the dataset will be downloaded automatically from the datasets Hub). + # + # For CSV/JSON files, this script will use as labels the column called 'label' and as pair of sentences the + # sentences in columns called 'sentence1' and 'sentence2' if such column exists or the first two columns not named + # label if at least two columns are provided. + # + # If the CSVs/JSONs contain only one non-label column, the script does single sentence classification on this + # single column. You can easily tweak this behavior (see below) + # + # In distributed training, the load_dataset function guarantee that only one local process can concurrently + # download the dataset. + if data_args.task_name is not None: + # Downloading and loading a dataset from the hub. + raw_datasets = load_dataset("glue", data_args.task_name, cache_dir=model_args.cache_dir) + elif data_args.dataset_name is not None: + # Downloading and loading a dataset from the hub. + raw_datasets = load_dataset( + data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir + ) + else: + # Loading a dataset from your local files. + # CSV/JSON training and evaluation files are needed. + data_files = {"train": data_args.train_file, "validation": data_args.validation_file} + + for key in data_files.keys(): + logger.info(f"load a local file for {key}: {data_files[key]}") + + if data_args.train_file.endswith(".csv"): + # Loading a dataset from local csv files + raw_datasets = load_dataset("csv", data_files=data_files, cache_dir=model_args.cache_dir) + else: + # Loading a dataset from local json files + raw_datasets = load_dataset("json", data_files=data_files, cache_dir=model_args.cache_dir) + # See more about loading any type of standard or custom dataset at + # https://huggingface.co/docs/datasets/loading_datasets.html. + + # Labels + if data_args.task_name is not None: + is_regression = data_args.task_name == "stsb" + if not is_regression: + label_list = raw_datasets["train"].features["label"].names + num_labels = len(label_list) + else: + num_labels = 1 + else: + # Trying to have good defaults here, don't hesitate to tweak to your needs. + is_regression = raw_datasets["train"].features["label"].dtype in ["float32", "float64"] + if is_regression: + num_labels = 1 + else: + # A useful fast method: + # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.unique + label_list = raw_datasets["train"].unique("label") + label_list.sort() # Let's sort it for determinism + num_labels = len(label_list) + + # Load pretrained model and tokenizer + # + # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently + # download model & vocab. 
+ config = AutoConfig.from_pretrained( + model_args.config_name if model_args.config_name else model_args.model_name_or_path, + num_labels=num_labels, + finetuning_task=data_args.task_name, + cache_dir=model_args.cache_dir, + revision=model_args.model_revision, + use_auth_token=True if model_args.use_auth_token else None, + ) + tokenizer = AutoTokenizer.from_pretrained( + model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, + cache_dir=model_args.cache_dir, + use_fast=model_args.use_fast_tokenizer, + revision=model_args.model_revision, + use_auth_token=True if model_args.use_auth_token else None, + ) + if optim_args.int8: + # Load the model obtained after Intel Neural Compressor (INC) quantization + model = OptimizedModel.from_pretrained( + model_args.model_name_or_path, + from_tf=bool(".ckpt" in model_args.model_name_or_path), + config=config, + cache_dir=model_args.cache_dir, + revision=model_args.model_revision, + use_auth_token=True if model_args.use_auth_token else None, + ) + else: + model = AutoModelForSequenceClassification.from_pretrained( + model_args.model_name_or_path, + from_tf=bool(".ckpt" in model_args.model_name_or_path), + config=config, + cache_dir=model_args.cache_dir, + revision=model_args.model_revision, + use_auth_token=True if model_args.use_auth_token else None, + ) + + # Preprocessing the raw_datasets + if data_args.task_name is not None: + sentence1_key, sentence2_key = task_to_keys[data_args.task_name] + else: + # Again, we try to have some nice defaults but don't hesitate to tweak to your use case. + non_label_column_names = [name for name in raw_datasets["train"].column_names if name != "label"] + if "sentence1" in non_label_column_names and "sentence2" in non_label_column_names: + sentence1_key, sentence2_key = "sentence1", "sentence2" + else: + if len(non_label_column_names) >= 2: + sentence1_key, sentence2_key = non_label_column_names[:2] + else: + sentence1_key, sentence2_key = non_label_column_names[0], None + + # Padding strategy + if data_args.pad_to_max_length: + padding = "max_length" + else: + # We will pad later, dynamically at batch creation, to the max sequence length in each batch + padding = False + + # Some models have set the order of the labels to use, so let's make sure we do use it. + label_to_id = None + if ( + model.config.label2id != PretrainedConfig(num_labels=num_labels).label2id + and data_args.task_name is not None + and not is_regression + ): + # Some have all caps in their config, some don't. + label_name_to_id = {k.lower(): v for k, v in model.config.label2id.items()} + if list(sorted(label_name_to_id.keys())) == list(sorted(label_list)): + label_to_id = {i: int(label_name_to_id[label_list[i]]) for i in range(num_labels)} + else: + logger.warning( + f"Your model seems to have been trained with labels, but they don't match the dataset: " + f"model labels: {list(sorted(label_name_to_id.keys()))}, dataset labels: {list(sorted(label_list))}.\n" + f"Ignoring the model labels as a result." + ) + elif data_args.task_name is None and not is_regression: + label_to_id = {v: i for i, v in enumerate(label_list)} + + if label_to_id is not None: + model.config.label2id = label_to_id + model.config.id2label = {id: label for label, id in config.label2id.items()} + + if data_args.max_seq_length > tokenizer.model_max_length: + logger.warning( + f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the" + f"model ({tokenizer.model_max_length}). 
Using max_seq_length={tokenizer.model_max_length}." + ) + max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length) + + def preprocess_function(examples): + # Tokenize the texts + args = ( + (examples[sentence1_key],) if sentence2_key is None else (examples[sentence1_key], examples[sentence2_key]) + ) + result = tokenizer(*args, padding=padding, max_length=max_seq_length, truncation=True) + + # Map labels to IDs (not necessary for GLUE tasks) + if label_to_id is not None and "label" in examples: + result["label"] = [(label_to_id[l] if l != -1 else -1) for l in examples["label"]] + return result + + with training_args.main_process_first(desc="dataset map pre-processing"): + raw_datasets = raw_datasets.map( + preprocess_function, batched=True, load_from_cache_file=not data_args.overwrite_cache + ) + if training_args.do_train: + if "train" not in raw_datasets: + raise ValueError("--do_train requires a train dataset") + train_dataset = raw_datasets["train"] + if data_args.max_train_samples is not None: + train_dataset = train_dataset.select(range(data_args.max_train_samples)) + + if training_args.do_eval: + if "validation" not in raw_datasets and "validation_matched" not in raw_datasets: + raise ValueError("--do_eval requires a validation dataset") + eval_dataset = raw_datasets["validation_matched" if data_args.task_name == "mnli" else "validation"] + if data_args.max_eval_samples is not None: + eval_dataset = eval_dataset.select(range(data_args.max_eval_samples)) + + # Log a few random samples from the training set: + if training_args.do_train: + for index in random.sample(range(len(train_dataset)), 3): + logger.info(f"Sample {index} of the training set: {train_dataset[index]}.") + + # Get the metric function + if data_args.task_name is not None: + metric = load_metric("glue", data_args.task_name) + else: + metric = load_metric("accuracy") + + # You can define your custom compute_metrics function. It takes an `EvalPrediction` object (a namedtuple with a + # predictions and label_ids field) and has to return a dictionary string to float. + def compute_metrics(p: EvalPrediction): + preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions + preds = np.squeeze(preds) if is_regression else np.argmax(preds, axis=1) + if data_args.task_name is not None: + result = metric.compute(predictions=preds, references=p.label_ids) + if len(result) > 1: + result["combined_score"] = np.mean(list(result.values())).item() + return result + elif is_regression: + return {"mse": ((preds - p.label_ids) ** 2).mean().item()} + else: + return {"accuracy": (preds == p.label_ids).astype(np.float32).mean().item()} + + # Data collator will default to DataCollatorWithPadding, so we change it if we already did the padding. 
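+    # default_data_collator is enough when every sample was already padded to
+    # max_seq_length above; under fp16 we pad dynamically to a multiple of 8 to
+    # keep tensor shapes Tensor-Core friendly; otherwise None lets the Trainer
+    # fall back to DataCollatorWithPadding for per-batch dynamic padding.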
+ if data_args.pad_to_max_length: + data_collator = default_data_collator + elif training_args.fp16: + data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8) + else: + data_collator = None + + metric_name = ( + optim_args.metric_name + if optim_args.metric_name is not None + else "eval_" + + ( + "pearson" + if data_args.task_name == "stsb" + else "matthews_correlation" + if data_args.task_name == "cola" + else "accuracy" + ) + ) + training_args.metric_for_best_model = metric_name + + # Initialize our Trainer + dataset_id = "SetFit/amazon_counterfactual" + train_dataset = datasets.load_dataset(dataset_id, name="en")['train'] + + def preprocess_function(examples): + # Tokenize the texts + args = examples['text'] + result= tokenizer(args, padding=padding, max_length=max_seq_length, truncation=True) + + return result + + with training_args.main_process_first(desc="dataset map pre-processing"): + train_dataset = train_dataset.map( + preprocess_function, batched=True, load_from_cache_file=not data_args.overwrite_cache + ) + + trainer = NLPTrainer( + model=model, + args=training_args, + train_dataset=train_dataset if training_args.do_train else None, + eval_dataset=eval_dataset if training_args.do_eval else None, + compute_metrics=compute_metrics, + tokenizer=tokenizer, + data_collator=data_collator, + ) + + + if optim_args.tune: + + if not training_args.do_eval: + raise ValueError("do_eval must be set to True for quantization.") + + if optim_args.quantization_approach != "PostTrainingDynamic": + if not training_args.do_train: + raise ValueError( + "do_train must be set to True for static and aware training quantization." + ) + if optim_args.quantization_approach == "QuantizationAwareTraining": + early_stopping_patience = 6 + early_stopping_threshold = 0.001 # optional + trainer.add_callback(transformers.EarlyStoppingCallback(early_stopping_patience, + early_stopping_threshold)) + + tune_metric = metrics.Metric( + name=metric_name, is_relative=optim_args.is_relative, criterion=optim_args.perf_tol + ) + objective = objectives.performance + quantization_config = QuantizationConfig( + approach=optim_args.quantization_approach, + max_trials=600, + metrics=[tune_metric], + objectives=[objective], + sampling_size = len(train_dataset)//20 + ) + + stmodel = SentenceTransformer(model_args.model_name_or_path) + def eval_func(model): + stmodel[0].auto_model = model.bert + evaluation = MTEB(task_langs=['en'], tasks=['AmazonCounterfactualClassification']) + results = evaluation.run(stmodel, overwrite_results=True) + print(results) + return results['AmazonCounterfactualClassification']['test']['en']['accuracy'] + model = trainer.quantize( + quant_config=quantization_config, + eval_func=eval_func, + ) + + if optim_args.benchmark_only: + model_path = model_args.model_name_or_path + # to avoid wrong architecture from model name (only work for fp32, like bert-base-uncased). 
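+        # A plain encoder checkpoint (e.g. an embedding model such as
+        # BAAI/bge-large-en-v1.5) declares no *SequenceClassification architecture,
+        # so in that case we benchmark the in-memory model object rather than the path.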
+        if 'SequenceClassification' not in config.architectures[0]:
+            model_path = model
+        trainer.benchmark(
+            model_path,
+            batch_size=training_args.per_device_eval_batch_size,
+            cores_per_instance=optim_args.cores_per_instance,
+            num_of_instance=optim_args.num_of_instance,
+        )
+
+    if optim_args.benchmark or optim_args.accuracy_only:
+
+        results = trainer.evaluate()
+        logger.info("metrics keys: {}".format(results.keys()))
+        bert_task_acc_keys = ['eval_f1', 'eval_accuracy', 'eval_matthews_correlation',
+                              'eval_pearson', 'eval_mcc', 'eval_spearmanr']
+        ret = False
+        for key in bert_task_acc_keys:
+            if key in results.keys():
+                ret = True
+                throughput = results.get("eval_samples_per_second")
+                print('Batch size = {}'.format(training_args.per_device_eval_batch_size))
+                print("Final eval {} accuracy: {:.5f}".format(key, results[key]))
+                print("Latency: {:.5f} ms".format(1000 / throughput))
+                print("Throughput: {:.5f} samples/sec".format(throughput))
+                break
+        assert ret, "No metric returned, please check the inference metric!"
+
+    if optim_args.enable_bf16:
+        trainer.enable_bf16 = True
+    if optim_args.to_onnx:
+        trainer.enable_executor = True
+        trainer.export_to_onnx()
+
+if __name__ == "__main__":
+    main()
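+
+# Example invocations, mirroring what run_bert_base.sh passes for this example
+# (the cache dir and model name are placeholders to adapt):
+#
+#   python run_glue.py --model_name_or_path textattack/bert-base-uncased-MRPC \
+#       --task_name mrpc --do_train --do_eval --cache_dir ./tmp \
+#       --output_dir model_and_tokenizer --overwrite_output_dir --to_onnx
+#
+# Append "--tune --quantization_approach PostTrainingStatic" for post-training
+# static int8 quantization, or "--enable_bf16" for bf16.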