[Engine] QAT Stable Diffusion Enabling (#95)
Zhenzhong1 authored Aug 10, 2023
1 parent 76ee683 commit 02cc596
Showing 26 changed files with 1,572 additions and 196 deletions.
22 changes: 22 additions & 0 deletions examples/.config/engine_deploy.json
@@ -520,6 +520,28 @@
"launcher": {
}
},
"stable_diffusion_v1_5_qat": {
"working_dir": "huggingface/pytorch/text-to-image/deployment/stable_diffusion",
"data_dir": "",
"hf_model_name": "runwayml/stable-diffusion-v1-5",
"prepare": {
"cmd": "bash -x export_model.sh",
"params": {
"input_model": "/tf_dataset2/models/nlp_toolkit/stable-diffusion-v1-5-qat",
"precision": "qat_int8"
}
},
"benchmark": {
"cmd": "python run_executor.py",
"params": {
"ir_path": "_ir",
"mode": "benchmark",
"input_model" : "runwayml/stable-diffusion-v1-5"
}
},
"launcher": {
}
},
"stable_diffusion_v2_1": {
"working_dir": "huggingface/pytorch/text-to-image/deployment/stable_diffusion",
"data_dir": "",
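For reference, a sketch of the manual commands the new `stable_diffusion_v1_5_qat` entry maps to, assuming the deploy harness passes each `params` key as a `--key=value` flag (the `/tf_dataset2/...` path is environment-specific):

```shell
# prepare: export the QAT INT8 model to Neural Engine IR
bash -x export_model.sh --input_model=/tf_dataset2/models/nlp_toolkit/stable-diffusion-v1-5-qat --precision=qat_int8

# benchmark: run the exported IR
python run_executor.py --ir_path=_ir --mode=benchmark --input_model=runwayml/stable-diffusion-v1-5
```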
@@ -36,6 +36,8 @@ pip install -v .
Install the required dependencies for this example:
```shell
cd <intel_extension_for_transformers_folder>/examples/huggingface/pytorch/text-to-image/deployment/stable_diffusion

# Please update requirements.txt manually to install transformers==4.28.1
pip install -r requirements.txt
```
>**Note**: We recommend installing protobuf <= 3.20.0 if you use onnxruntime <= 1.11.
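A minimal sketch combining both version notes (equivalent to editing requirements.txt by hand; drop the protobuf cap if your onnxruntime is newer than 1.11):

```shell
# transformers pin from the comment above; protobuf cap only for onnxruntime <= 1.11
pip install "transformers==4.28.1" "protobuf<=3.20.0"
```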
@@ -74,41 +76,58 @@
Set --bf16 to export FP32 and BF16 models.
```python
python prepare_model.py --input_model=CompVis/stable-diffusion-v1-4 --output_path=./model --bf16
```

Set --qat_int8 to export INT8 models; **only runwayml/stable-diffusion-v1-5 is supported**.

**NOTE**: You need a fake_quantized_model_qinit.pt before preparing the QAT models. Please refer to the [QAT quantization README](https://github.com/intel/intel-extension-for-transformers/blob/main/examples/huggingface/pytorch/text-to-image/quantization/qat/README.md).
```python
# Don't forget to get a fake_quantized_model_qinit.pt first.
python prepare_model.py --input_model=runwayml/stable-diffusion-v1-5 --output_path=./model --qat_int8
```
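After this step, the directory passed to `--output_path` should hold the three sub-models that `export_model.sh` reads for the QAT path; a sketch of the expected layout, inferred from the paths that script consumes:

```
model/
├── text_encoder_bf16/model.onnx
├── unet_qat_int8/model.onnx
└── vae_decoder_bf16/model.onnx
```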

### 1.2 Compile Models

Export the three FP32 ONNX sub-models of stable diffusion to Neural Engine IR.

```bash
# run the following bash command to get all the IRs.
bash export_model.sh --input_model=model --precision=fp32
```

Export the three BF16 ONNX sub-models of stable diffusion to Neural Engine IR.

```bash
# run the following bash command to get all the IRs.
bash export_model.sh --input_model=model --precision=bf16
```

Export the mixed FP32 & dynamically quantized INT8 IR.

```bash
# run the following command to get the mixed FP32 & dynamically quantized INT8 IR.
bash export_model.sh --input_model=model --precision=fp32 --cast_type=dynamic_int8
```

Export the mixed BF16 & QAT quantized INT8 IR.
```bash
# run the following command to get the mixed BF16 & QAT quantized INT8 IR.
bash export_model.sh --input_model=model --precision=qat_int8
```
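If the export succeeds, the three IRs land under `./qat_int8_ir/`; the directory names come from `export_model.sh`, and each IR is a `conf.yaml` plus `model.bin` pair as loaded by `neural_engine_init`:

```
qat_int8_ir/
├── text_encoder/   (conf.yaml, model.bin)
├── unet/           (conf.yaml, model.bin)
└── vae_decoder/    (conf.yaml, model.bin)
```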

## 2. Performance

Python API commands are as follows:
```python
# FP32 IR
python run_executor.py --ir_path=./fp32_ir --mode=latency --input_model=CompVis/stable-diffusion-v1-4

# mixed FP32 & dynamically quantized INT8 IR
python run_executor.py --ir_path=./fp32_dynamic_int8_ir --mode=latency --input_model=CompVis/stable-diffusion-v1-4

# BF16 IR
python run_executor.py --ir_path=./bf16_ir --mode=latency --input_model=CompVis/stable-diffusion-v1-4

# QAT INT8 IR
python run_executor.py --ir_path=./qat_int8_ir --mode=latency --input_model=runwayml/stable-diffusion-v1-5
```
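The `engine_deploy.json` entry in this commit also drives `run_executor.py` with `--mode=benchmark`; presumably the same flag works from the command line, as in this sketch based on that config:

```python
# QAT INT8 IR, benchmark mode (mode value taken from engine_deploy.json)
python run_executor.py --ir_path=./qat_int8_ir --mode=benchmark --input_model=runwayml/stable-diffusion-v1-5
```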

## 3. Accuracy
@@ -120,19 +139,22 @@
Python API commands are as follows:
```python
# FP32 IR
python run_executor.py --ir_path=./fp32_ir --mode=accuracy --input_model=CompVis/stable-diffusion-v1-4

# mixed FP32 & dynamically quantized INT8 IR
python run_executor.py --ir_path=./fp32_dynamic_int8_ir --mode=accuracy --input_model=CompVis/stable-diffusion-v1-4

# BF16 IR
python run_executor.py --ir_path=./bf16_ir --mode=accuracy --input_model=CompVis/stable-diffusion-v1-4

# QAT INT8 IR
python run_executor.py --ir_path=./qat_int8_ir --mode=accuracy --input_model=runwayml/stable-diffusion-v1-5
```

## 4. Try Text to Image

Try using one sentence to create a picture!

```python
# To run FP32 or BF16 models, just load the corresponding IR.
# FP32 models
python run_executor.py --ir_path=./fp32_ir --input_model=CompVis/stable-diffusion-v1-4
```
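The QAT INT8 IR should work the same way; a sketch mirroring the command above with the v1-5 model used throughout this commit:

```python
# QAT INT8 models
python run_executor.py --ir_path=./qat_int8_ir --input_model=runwayml/stable-diffusion-v1-5
```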
@@ -147,7 +169,7 @@ python run_executor.py --ir_path=./bf16_ir --input_model=CompVis/stable-diffusio
> Note:
> 1. The default pretrained model is "CompVis/stable-diffusion-v1-4".
> 2. The default prompt is "a photo of an astronaut riding a horse on mars" and the default output name is "astronaut_rides_horse.png".
> 3. The IR directory should include three IRs: text_encoder, unet and vae_decoder.
## 5. Validated Results

@@ -669,7 +669,7 @@ def neural_engine_init(ir_path):
unet_path = ir_path + '/unet/'
unet_conf = unet_path + 'conf.yaml'
unet_bin = unet_path + 'model.bin'
unet_graph.graph_init(unet_conf, unet_bin, True)

vae_decoder_graph = Graph()
vae_decoder_path = ir_path + '/vae_decoder/'
@@ -72,6 +72,8 @@
'StableDiffusion_MHAReshape': True,
'StableDiffusion_MHA': False,
'ExplicitNHWCTransposeForConv': True,
'ExplicitNHWCTransposeForConvQAT': False,
'MultiHeadAttention': False,

# Channel_last
'ConvReshape': False
@@ -132,6 +134,78 @@
'StableDiffusion_MHAReshape': True,
'StableDiffusion_MHA': False,
'ExplicitNHWCTransposeForConv': True,
'ExplicitNHWCTransposeForConvQAT': False,
'MultiHeadAttention': False,

# Channel_last
'ConvReshape': False
}
}

qat_unet_pattern_config = {
'pattern_switch': {
# General Pattern
'PaddingSequence': False,
'AttentionReshape': False,
'QKVReshape': False,
'ReshapeFusion': False,
'InsertBF16Node': False,
'OperatorAdaptor': False,

# transpose_int8
'QKVMerge': False,

# TextEncoder
'TextEncoder_WordEmbedding': False,
'TextEncoder_QReshape': False,
'TextEncoder_KVReshape': False,
'TextEncoder_AttentionMaskAddReshape': False,
'TextEncoder_SoftmaxReshape': False,
'TextEncoder_MulReshape': False,
'TextEncoder_AttentionReshape': False,
'TextEncoder_CasualAttentionMask': False,

# for unet and vae decoder
'GroupNorm': True,

# vae decoder & Transformer2Dmodel
'AttentionBlock_Resize2Gather': True,
'AttentionBlock_QKVPreReshape': True,
'AttentionBlock_AttentionMaskAddReshape': True,
'AttentionBlock_ConstantOfShapeWithMul': True,

'Transformer2Dmodel_GetSampleBatch': True,
'Transformer2Dmodel_SampleSlice': True,
'Transformer2Dmodel_EncoderHiddenStatesReshape': True,
'Transformer2Dmodel_ConstantOfShapeWithMul': True,
'Transformer2Dmodel_QKVPreReshape': True,
'Transformer2Dmodel_QKVReshape': True,
'AttentionBlock_QKVReshape': False,
'Transformer2Dmodel_QKVReshapeTo4D': True,
'Transformer2Dmodel_AttentionMaskAddReshape': True,
'Transformer2Dmodel_FFNInputSlice': True,
'Transformer2Dmodel_FFNInputSlice_1': True,
'Transformer2DModel_UpBlockResize': True,

# for all stable diffusion models
'StableDiffusion_bf16Convert': True,
'StableDiffusion_ReshapeFusion': True,

# MHA
'TorchInsertBF16Node': False,
'StableDiffusion_MHAReshape': True,
'StableDiffusion_MHA': True,
'ExplicitNHWCTransposeForConv': False,
'ExplicitNHWCTransposeForConvQAT': True,
'MultiHeadAttention': False,

# QAT for the stable diffusion
'StableDiffusion_InsertQuantNode': True,
'StableDiffusion_CollectQuantInfo': True,
'CollectQuantInfo': False,
'InsertQuantNode': False,
'QuantizeFusion': False,
'StableDiffusion_QuantizeFusion': True,

# Channel_last
'ConvReshape': False
@@ -192,6 +266,8 @@
'StableDiffusion_MHAReshape': True,
'StableDiffusion_MHA': False,
'ExplicitNHWCTransposeForConv': True,
'ExplicitNHWCTransposeForConvQAT': False,
'MultiHeadAttention': False,

# Channel_last
'ConvReshape': False
@@ -225,6 +301,11 @@
    with autocast(args.dtype):
        graph = compile(args.onnx_model, args.pattern_config)
        graph.save(args.output_path)
elif args.dtype == "qat_int8":
    args.pattern_config = qat_unet_pattern_config
    with autocast(args.dtype):
        graph = compile(args.onnx_model, args.pattern_config)
        graph.save(args.output_path)
else:
    graph = compile(args.onnx_model, args.pattern_config)
    graph.save(args.output_path)
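For a single sub-model, `export_ir.py` can also be invoked directly with the new dtype; this sketch mirrors the unet line that `export_model.sh` (below) runs for the QAT path, assuming the ONNX model sits under `model/`:

```bash
python export_ir.py --onnx_model=model/unet_qat_int8/model.onnx --pattern_config=unet --output_path=./qat_int8_ir/unet/ --dtype=qat_int8
```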
@@ -47,6 +47,23 @@ python export_ir.py --onnx_model=${input_model}/vae_decoder_fp32/model.onnx --pa
exit
fi

if [[ ${precision} == 'qat_int8' ]]; then
cast_type=qat_int8
echo "[INFO] cast_type is qat int8"
# 1. text encoder
echo "[INFO] Start to export text encoder bf16 ir..."
python export_ir.py --onnx_model=${input_model}/text_encoder_bf16/model.onnx --pattern_config=text_encoder --output_path=./${cast_type}_ir/text_encoder/ --dtype=bf16

# 2. unet
echo "[INFO] Start to export unet qat int8 ir..."
python export_ir.py --onnx_model=${input_model}/unet_${cast_type}/model.onnx --pattern_config=unet --output_path=./${cast_type}_ir/unet/ --dtype=${cast_type}

# 3. vae_decoder
echo "[INFO] start to export vae_decoder bf16 ir..."
python export_ir.py --onnx_model=${input_model}/vae_decoder_bf16/model.onnx --pattern_config=vae_decoder --output_path=./${cast_type}_ir/vae_decoder/ --dtype=bf16
exit
fi

# 1. text encoder
echo "[INFO] Start to export text encoder ir..."
python export_ir.py --onnx_model=${input_model}/text_encoder_${precision}/model.onnx --pattern_config=text_encoder --output_path=./${precision}_ir/text_encoder/ --dtype=${precision}