From 0e2f2f6e2f5322e8741d523317872eafa32e47c1 Mon Sep 17 00:00:00 2001 From: xinyu-intel Date: Mon, 18 Nov 2019 13:18:34 +0800 Subject: [PATCH 01/29] init support for int8 bert classification --- scripts/bert/data/transform.py | 5 +- scripts/bert/finetune_classifier.py | 80 ++++++++++++++++++++++++--- src/gluonnlp/calibration/__init__.py | 2 + src/gluonnlp/calibration/collector.py | 43 ++++++++++++++ 4 files changed, 120 insertions(+), 10 deletions(-) create mode 100644 src/gluonnlp/calibration/__init__.py create mode 100644 src/gluonnlp/calibration/collector.py diff --git a/scripts/bert/data/transform.py b/scripts/bert/data/transform.py index be8cabc6b4..d8bef6efc2 100644 --- a/scripts/bert/data/transform.py +++ b/scripts/bert/data/transform.py @@ -124,6 +124,7 @@ def __call__(self, line): if self.class_labels: label = self._label_map[label] label = np.array([label], dtype=self._label_dtype) - return input_ids, valid_length, segment_ids, label + return input_ids, segment_ids, valid_length, label else: - return self._bert_xform(line) + input_ids, valid_length, segment_ids = self._bert_xform(line) + return input_ids, segment_ids, valid_length diff --git a/scripts/bert/finetune_classifier.py b/scripts/bert/finetune_classifier.py index b4e89ee35b..3babe3bcbd 100644 --- a/scripts/bert/finetune_classifier.py +++ b/scripts/bert/finetune_classifier.py @@ -43,9 +43,11 @@ import mxnet as mx from mxnet import gluon from mxnet.contrib.amp import amp +from mxnet.contrib.quantization import * import gluonnlp as nlp from gluonnlp.data import BERTTokenizer from gluonnlp.model import BERTClassifier, RoBERTaClassifier +from gluonnlp.calibration import BertLayerCollector from data.classification import MRPCTask, QQPTask, RTETask, STSBTask, SSTTask from data.classification import QNLITask, CoLATask, MNLITask, WNLITask, XNLITask @@ -178,6 +180,20 @@ default=None, help='Whether to perform early stopping based on the metric on dev set. ' 'The provided value is the patience. ') +parser.add_argument('--deploy', action='store_true', + help='whether load static model for deployment') +parser.add_argument('--model_prefix', type=str, required=False, + help='load static model as hybridblock.') +parser.add_argument('--only_calibration', action='store_true', + help='quantize model') +parser.add_argument('--num_calib_batches', type=int, default=5, + help='number of batches for calibration') +parser.add_argument('--quantized_dtype', type=str, default='auto', + choices=['auto', 'int8', 'uint8'], + help='quantization destination data type for input data') +parser.add_argument('--calib_mode', type=str, default='customize', + choices=['none', 'naive', 'entropy', 'customize'], + help='calibration mode used for generating calibration table for the quantized symbol.') args = parser.parse_args() @@ -222,6 +238,11 @@ dataset = args.bert_dataset pretrained_bert_parameters = args.pretrained_bert_parameters model_parameters = args.model_parameters + +# load symbolic model +deploy = args.deploy +model_prefix = args.model_prefix + if only_inference and not model_parameters: warnings.warn('model_parameters is not set. 
' 'Randomly initialized model will be used for inference.')
@@ -275,6 +296,11 @@
 nlp.utils.load_parameters(model, model_parameters, ctx=ctx, cast_dtype=True)
 nlp.utils.mkdir(output_dir)
 
+if deploy:
+    logging.info('load symbol file directly as SymbolBlock for model deployment')
+    model = mx.gluon.SymbolBlock.imports('{}-symbol.json'.format(args.model_prefix),
+                                         ['data0', 'data1', 'data2'], '{}-0000.params'.format(args.model_prefix))
+
 logging.debug(model)
 model.hybridize(static_alloc=True)
 loss_function.hybridize(static_alloc=True)
@@ -286,6 +312,12 @@
 else:
     bert_tokenizer = BERTTokenizer(vocabulary, lower=do_lower_case)
 
+# calibration config
+only_calibration = args.only_calibration
+num_calib_batches = args.num_calib_batches
+quantized_dtype = args.quantized_dtype
+calib_mode = args.calib_mode
+
 def preprocess_data(tokenizer, task, batch_size, dev_batch_size, max_len, vocab, pad=False):
     """Train/eval Data preparation function."""
     # transformation for data train and dev
@@ -302,13 +334,13 @@ def preprocess_data(tokenizer, task, batch_size, dev_batch_size, max_len, vocab,
     train_tsv = task.dataset_train()[1]
     data_train = mx.gluon.data.SimpleDataset(list(map(trans, train_tsv)))
     data_train_len = data_train.transform(
-        lambda input_id, length, segment_id, label_id: length, lazy=False)
+        lambda input_id, segment_id, length, label_id: length, lazy=False)
     # bucket sampler for training
     pad_val = vocabulary[vocabulary.padding_token]
     batchify_fn = nlp.data.batchify.Tuple(
         nlp.data.batchify.Pad(axis=0, pad_val=pad_val),  # input
-        nlp.data.batchify.Stack(),  # length
         nlp.data.batchify.Pad(axis=0, pad_val=0),  # segment
+        nlp.data.batchify.Stack(),  # length
         nlp.data.batchify.Stack(label_dtype))  # label
     batch_sampler = nlp.data.sampler.FixedBucketSampler(
         data_train_len,
@@ -339,8 +371,9 @@ def preprocess_data(tokenizer, task, batch_size, dev_batch_size, max_len, vocab,
 
     # batchify for data test
     test_batchify_fn = nlp.data.batchify.Tuple(
-        nlp.data.batchify.Pad(axis=0, pad_val=pad_val), nlp.data.batchify.Stack(),
-        nlp.data.batchify.Pad(axis=0, pad_val=0))
+        nlp.data.batchify.Pad(axis=0, pad_val=pad_val),
+        nlp.data.batchify.Pad(axis=0, pad_val=0),
+        nlp.data.batchify.Stack())
 
     # transform for data test
     test_trans = BERTDatasetTransform(tokenizer, max_len, vocab=vocab,
@@ -369,6 +402,30 @@ def preprocess_data(tokenizer, task, batch_size, dev_batch_size, max_len, vocab,
 
 train_data, dev_data_list, test_data_list, num_train_examples = preprocess_data(
     bert_tokenizer, task, batch_size, dev_batch_size, args.max_len, vocabulary, args.pad)
 
+def calibration(net, dev_data_list, num_calib_batches, quantized_dtype, calib_mode):
+    """calibration function on the dev dataset."""
+    assert len(dev_data_list) == 1, \
+        "Currently, MNLI is not supported."
+    assert ctx == mx.cpu(), \
+        "Currently only supports CPU with MKL-DNN backend."
+ logging.info('Now we are doing calibration on dev with %s.', ctx) + for _, dev_data in dev_data_list: + collector = BertLayerCollector(clip_min=-50, clip_max=10, logger=logging) + net = quantize_net(net, quantized_dtype=quantized_dtype, + exclude_layers=[], + exclude_layers_match=['elemwise_add'], + calib_data=dev_data, + calib_mode=calib_mode, + num_calib_examples=dev_batch_size * num_calib_batches, + ctx=ctx, + LayerOutputCollector=collector, + logger=logging) + # save params + ckpt_name = 'model_bert_{0}_quantized_{1}'.format(task_name, calib_mode) + params_saved = os.path.join(output_dir, ckpt_name) + net.export(params_saved, epoch=0) + logging.info('Saving quantized model at %s' % output_dir) + def test(loader_test, segment): """Inference function on the test dataset.""" @@ -377,7 +434,7 @@ def test(loader_test, segment): tic = time.time() results = [] for _, seqs in enumerate(loader_test): - input_ids, valid_length, segment_ids = seqs + input_ids, segment_ids, valid_length = seqs input_ids = input_ids.as_in_context(ctx) valid_length = valid_length.as_in_context(ctx).astype('float32') if use_roberta: @@ -490,7 +547,7 @@ def train(metric): # forward and backward with mx.autograd.record(): - input_ids, valid_length, segment_ids, label = seqs + input_ids, segment_ids, valid_length, label = seqs input_ids = input_ids.as_in_context(ctx) valid_length = valid_length.as_in_context(ctx).astype('float32') label = label.as_in_context(ctx) @@ -568,7 +625,7 @@ def evaluate(loader_dev, metric, segment): step_loss = 0 tic = time.time() for batch_id, seqs in enumerate(loader_dev): - input_ids, valid_length, segment_ids, label = seqs + input_ids, segment_ids, valid_length, label = seqs input_ids = input_ids.as_in_context(ctx) valid_length = valid_length.as_in_context(ctx).astype('float32') label = label.as_in_context(ctx) @@ -599,4 +656,11 @@ def evaluate(loader_dev, metric, segment): if __name__ == '__main__': - train(task.metrics) + if only_calibration: + calibration(model, + dev_data_list, + num_calib_batches, + quantized_dtype, + calib_mode) + else: + train(task.metrics) diff --git a/src/gluonnlp/calibration/__init__.py b/src/gluonnlp/calibration/__init__.py new file mode 100644 index 0000000000..056b65d1c9 --- /dev/null +++ b/src/gluonnlp/calibration/__init__.py @@ -0,0 +1,2 @@ +from . import collector +from .collector import * \ No newline at end of file diff --git a/src/gluonnlp/calibration/collector.py b/src/gluonnlp/calibration/collector.py new file mode 100644 index 0000000000..ef3769170a --- /dev/null +++ b/src/gluonnlp/calibration/collector.py @@ -0,0 +1,43 @@ + +import ctypes +import numpy as np +from mxnet import ndarray +from mxnet.base import NDArrayHandle, py_str +from mxnet.ndarray import NDArray + +class BertLayerCollector(object): + """Saves layer output min and max values in a dict with layer names as keys. + The collected min and max values will be directly used as thresholds for quantization. 
+    """
+    def __init__(self, clip_min=None, clip_max=None, logger=None):
+        self.include_layer = lambda name: name.endswith('_output') or \
+                             name.endswith('reshape10_0') or \
+                             name.endswith('_mul0_0') or \
+                             name.endswith('_squeeze0_0')
+        self.min_max_dict = {}
+        self.clip_min = clip_min
+        self.clip_max = clip_max
+        self.logger = logger
+
+    def collect(self, name, arr):
+        """Callback function for collecting min and max values from an NDArray."""
+        name = py_str(name)
+        if self.include_layer is not None and not self.include_layer(name):
+            return
+        handle = ctypes.cast(arr, NDArrayHandle)
+        arr = NDArray(handle, writable=False)
+        min_range = ndarray.min(arr).asscalar()
+        max_range = ndarray.max(arr).asscalar()
+        if name.find('gelu0__mul0') != -1 and max_range > self.clip_max:
+            max_range = self.clip_max
+        if name.find('bertlayernorm0_layernorm0') != -1 and min_range < self.clip_min:
+            min_range = self.clip_min
+        if name in self.min_max_dict:
+            cur_min_max = self.min_max_dict[name]
+            self.min_max_dict[name] = (min(cur_min_max[0], min_range),
+                                       max(cur_min_max[1], max_range))
+        else:
+            self.min_max_dict[name] = (min_range, max_range)
+        if self.logger is not None:
+            self.logger.info("Collecting layer %s min_range=%f, max_range=%f"
+                             % (name, min_range, max_range))
\ No newline at end of file

From 710c6ce89e633ba2a5e42853daacfe3b5fb8e54c Mon Sep 17 00:00:00 2001
From: xinyu-intel
Date: Mon, 18 Nov 2019 14:56:00 +0800
Subject: [PATCH 02/29] support squad calibration

---
 scripts/bert/data/qa.py        |  12 +++-
 scripts/bert/finetune_squad.py | 110 ++++++++++++++++++++++++++++++++-
 2 files changed, 116 insertions(+), 6 deletions(-)

diff --git a/scripts/bert/data/qa.py b/scripts/bert/data/qa.py
index e6ef2294b4..44dc1d9c9d 100644
--- a/scripts/bert/data/qa.py
+++ b/scripts/bert/data/qa.py
@@ -52,7 +52,7 @@ def _worker_fn(example, transform):
     return feature
 
 
-def preprocess_dataset(dataset, transform, num_workers=8):
+def preprocess_dataset(dataset, transform, num_workers=8, for_calibration=False):
     """Use multiprocessing to perform transform for dataset.
 
     Parameters
@@ -77,8 +77,14 @@ def preprocess_dataset(dataset, transform, num_workers=8):
         dataset_transform.append(_data[:-1])
         dataset_len.append(_data[-1])
 
-    dataset = SimpleDataset(dataset_transform).transform(
-        lambda x: (x[0], x[1], x[2], x[3], x[4], x[5]))
+    if for_calibration:
+        # The Gluon calibration API expects each data entry to hold the input arrays plus one label.
+        dataset = SimpleDataset(dataset_transform).transform(
+            lambda x: (x[1], x[2], x[3], x[4]))
+    else:
+        dataset = SimpleDataset(dataset_transform).transform(
+            lambda x: (x[0], x[1], x[2], x[3], x[4], x[5]))
+
     end = time.time()
     pool.close()
     print('Done! Transform dataset costs %.2f seconds.'
% (end-start)) diff --git a/scripts/bert/finetune_squad.py b/scripts/bert/finetune_squad.py index 16c8615853..08c64510ec 100644 --- a/scripts/bert/finetune_squad.py +++ b/scripts/bert/finetune_squad.py @@ -44,11 +44,13 @@ import numpy as np import mxnet as mx +from mxnet.contrib.quantization import * import gluonnlp as nlp from gluonnlp.data import SQuAD +from gluonnlp.calibration import BertLayerCollector from model.qa import BertForQALoss, BertForQA -from data.qa import SQuADTransform, preprocess_dataset +from data.qa import SQuADTransform, preprocess_dataset, preprocess_calib_dataset from bert_qa_evaluate import get_F1_EM, predict, PredResult np.random.seed(6) @@ -197,6 +199,26 @@ action='store_true', help='Run the example in test mode for sanity checks') +parser.add_argument('--deploy', action='store_true', + help='whether load static model for deployment') + +parser.add_argument('--model_prefix', type=str, required=False, + help='load static model as hybridblock.') + +parser.add_argument('--only_calibration', action='store_true', + help='quantize model') + +parser.add_argument('--num_calib_batches', type=int, default=5, + help='number of batches for calibration') + +parser.add_argument('--quantized_dtype', type=str, default='auto', + choices=['auto', 'int8', 'uint8'], + help='quantization destination data type for input data') + +parser.add_argument('--calib_mode', type=str, default='customize', + choices=['none', 'naive', 'entropy', 'customize'], + help='calibration mode used for generating calibration table for the quantized symbol.') + args = parser.parse_args() output_dir = args.output_dir @@ -289,6 +311,10 @@ nlp.data.batchify.Stack('float32'), nlp.data.batchify.Stack('float32')) +# load symbolic model +deploy = args.deploy +model_prefix = args.model_prefix + net = BertForQA(bert=bert) if model_parameters: # load complete BertForQA parameters @@ -305,11 +331,21 @@ # no checkpoint is loaded net.initialize(init=mx.init.Normal(0.02), ctx=ctx) +if deploy: + logging.info('load symbol file directly as SymbolBlock for model deployment') + net = mx.gluon.SymbolBlock.imports('{}-symbol.json'.format(args.model_prefix), + ['data0', 'data1', 'data2'], '{}-0000.params'.format(args.model_prefix)) + net.hybridize(static_alloc=True) loss_function = BertForQALoss() loss_function.hybridize(static_alloc=True) +# calibration config +only_calibration = args.only_calibration +num_calib_batches = args.num_calib_batches +quantized_dtype = args.quantized_dtype +calib_mode = args.calib_mode def train(): """Training function.""" @@ -440,6 +476,69 @@ def set_new_lr(step_num, batch_id): net.save_parameters(os.path.join(output_dir, 'net.params')) +def calibration(net, num_calib_batches, quantized_dtype, calib_mode): + """calibration function on the dev dataset.""" + log.info('Loading dev data...') + if version_2: + dev_data = SQuAD('dev', version='2.0') + else: + dev_data = SQuAD('dev', version='1.1') + if args.debug: + sampled_data = [dev_data[0], dev_data[1], dev_data[2]] + dev_data = mx.gluon.data.SimpleDataset(sampled_data) + log.info('Number of records in dev data:{}'.format(len(dev_data))) + + batchify_fn_calib = nlp.data.batchify.Tuple( + nlp.data.batchify.Pad(axis=0, pad_val=vocab[vocab.padding_token]), + nlp.data.batchify.Pad(axis=0, pad_val=vocab[vocab.padding_token]), + nlp.data.batchify.Stack('float32'), + nlp.data.batchify.Stack('float32')) + + dev_dataset = dev_data.transform( + SQuADTransform( + copy.copy(tokenizer), + max_seq_length=max_seq_length, + doc_stride=doc_stride, + 
max_query_length=max_query_length, + is_pad=False, + is_training=False)._transform, lazy=False) + + dev_data_transform, _ = preprocess_dataset( + dev_data, SQuADTransform( + copy.copy(tokenizer), + max_seq_length=max_seq_length, + doc_stride=doc_stride, + max_query_length=max_query_length, + is_pad=False, + is_training=False), + for_calibration=only_calibration) + log.info('The number of examples after preprocessing:{}'.format( + len(dev_data_transform))) + + dev_dataloader = mx.gluon.data.DataLoader( + dev_data_transform, + batchify_fn=batchify_fn_calib, + num_workers=4, batch_size=test_batch_size, + shuffle=False, last_batch='keep') + + assert ctx == mx.cpu(), \ + "Currently only supports CPU with MKL-DNN backend." + log.info('Now we are doing calibration on dev with %s.', ctx) + collector = BertLayerCollector(clip_min=-50, clip_max=10, logger=log) + net = quantize_net(net, quantized_dtype=quantized_dtype, + exclude_layers=[], + exclude_layers_match=['elemwise_add'], + calib_data=dev_dataloader, + calib_mode=calib_mode, + num_calib_examples=test_batch_size * num_calib_batches, + ctx=ctx, + LayerOutputCollector=collector, + logger=log) + # save params + ckpt_name = 'model_bert_squad_quantized_{0}'.format(calib_mode) + params_saved = os.path.join(output_dir, ckpt_name) + net.export(params_saved, epoch=0) + log.info('Saving quantized model at %s' % output_dir) def evaluate(): """Evaluate the model on validation dataset. @@ -537,8 +636,13 @@ def evaluate(): if __name__ == '__main__': - if not only_predict: + if only_calibration: + calibration(net, + num_calib_batches, + quantized_dtype, + calib_mode) + elif not only_predict: train() evaluate() - elif model_parameters: + elif model_parameters or deploy: evaluate() From 9ef98b918448460cccda9d1710d832927be1f327 Mon Sep 17 00:00:00 2001 From: xinyu-intel Date: Mon, 18 Nov 2019 15:03:41 +0800 Subject: [PATCH 03/29] add headers --- src/gluonnlp/calibration/__init__.py | 19 ++++++++++++++++ src/gluonnlp/calibration/collector.py | 31 +++++++++++++++++++++++++++ 2 files changed, 50 insertions(+) diff --git a/src/gluonnlp/calibration/__init__.py b/src/gluonnlp/calibration/__init__.py index 056b65d1c9..0d140419f5 100644 --- a/src/gluonnlp/calibration/__init__.py +++ b/src/gluonnlp/calibration/__init__.py @@ -1,2 +1,21 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# pylint: disable=wildcard-import +"""This module includes calibration utilities such as layer collector.""" from . 
import collector
 from .collector import *
\ No newline at end of file
diff --git a/src/gluonnlp/calibration/collector.py b/src/gluonnlp/calibration/collector.py
index ef3769170a..3e72cbb2d8 100644
--- a/src/gluonnlp/calibration/collector.py
+++ b/src/gluonnlp/calibration/collector.py
@@ -1,4 +1,35 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint:disable=redefined-outer-name,logging-format-interpolation
+"""
+Bert layer output collector with threshold clipping for calibration
+===================================================================
+This collector is designed to collect and clip the layer outputs of BERT
+during calibration with Gluon NLP Toolkit.
+
+@article{devlin2018bert,
+  title={BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding},
+  author={Devlin, Jacob and Chang, Ming- \
+          Wei and Lee, Kenton and Toutanova, Kristina},
+  journal={arXiv preprint arXiv:1810.04805},
+  year={2018}
+}
+"""
 import ctypes
 import numpy as np
 from mxnet import ndarray

From ee2126ca2c12441720f83a3708a67210de817855 Mon Sep 17 00:00:00 2001
From: xinyu-intel
Date: Mon, 18 Nov 2019 17:13:02 +0800
Subject: [PATCH 04/29] enhance quantization and add readme

---
 scripts/bert/finetune_classifier.py |   9 ++-
 scripts/bert/finetune_squad.py      |  29 ++++---
 scripts/bert/quantization.md        | 112 ++++++++++++++++++++++++++++
 3 files changed, 135 insertions(+), 15 deletions(-)
 create mode 100644 scripts/bert/quantization.md

diff --git a/scripts/bert/finetune_classifier.py b/scripts/bert/finetune_classifier.py
index 3babe3bcbd..674ba1db98 100644
--- a/scripts/bert/finetune_classifier.py
+++ b/scripts/bert/finetune_classifier.py
@@ -296,14 +296,15 @@
 nlp.utils.load_parameters(model, model_parameters, ctx=ctx, cast_dtype=True)
 nlp.utils.mkdir(output_dir)
 
+logging.debug(model)
+model.hybridize(static_alloc=True)
+loss_function.hybridize(static_alloc=True)
+
 if deploy:
     logging.info('load symbol file directly as SymbolBlock for model deployment')
     model = mx.gluon.SymbolBlock.imports('{}-symbol.json'.format(args.model_prefix),
                                          ['data0', 'data1', 'data2'], '{}-0000.params'.format(args.model_prefix))
-
-logging.debug(model)
-model.hybridize(static_alloc=True)
-loss_function.hybridize(static_alloc=True)
+    model.hybridize(static_alloc=True, static_shape=True)
 
 # data processing
 do_lower_case = 'uncased' in dataset
diff --git a/scripts/bert/finetune_squad.py b/scripts/bert/finetune_squad.py
index 16c8615853..08c64510ec 100644
--- a/scripts/bert/finetune_squad.py
+++ b/scripts/bert/finetune_squad.py
@@ -50,7 +50,7 @@
 from gluonnlp.data import SQuAD
 from gluonnlp.calibration import BertLayerCollector
 from model.qa import BertForQALoss, BertForQA
-from data.qa import SQuADTransform, preprocess_dataset, preprocess_calib_dataset
+from data.qa import SQuADTransform,
preprocess_dataset from bert_qa_evaluate import get_F1_EM, predict, PredResult np.random.seed(6) @@ -150,6 +150,11 @@ 'Sequences longer than this will be truncated, and sequences shorter ' 'than this will be padded. default is 384') +parser.add_argument( + '--pad', + action='store_true', + help='Whether to pad to maximum length when preparing data batches. Default is False.') + parser.add_argument('--doc_stride', type=int, default=128, @@ -208,7 +213,7 @@ parser.add_argument('--only_calibration', action='store_true', help='quantize model') -parser.add_argument('--num_calib_batches', type=int, default=5, +parser.add_argument('--num_calib_batches', type=int, default=10, help='number of batches for calibration') parser.add_argument('--quantized_dtype', type=str, default='auto', @@ -267,6 +272,7 @@ null_score_diff_threshold = args.null_score_diff_threshold max_seq_length = args.max_seq_length +pad = args.pad doc_stride = args.doc_stride max_query_length = args.max_query_length n_best_size = args.n_best_size @@ -331,16 +337,17 @@ # no checkpoint is loaded net.initialize(init=mx.init.Normal(0.02), ctx=ctx) -if deploy: - logging.info('load symbol file directly as SymbolBlock for model deployment') - net = mx.gluon.SymbolBlock.imports('{}-symbol.json'.format(args.model_prefix), - ['data0', 'data1', 'data2'], '{}-0000.params'.format(args.model_prefix)) - net.hybridize(static_alloc=True) loss_function = BertForQALoss() loss_function.hybridize(static_alloc=True) +if deploy: + logging.info('load symbol file directly as SymbolBlock for model deployment') + net = mx.gluon.SymbolBlock.imports('{}-symbol.json'.format(args.model_prefix), + ['data0', 'data1', 'data2'], '{}-0000.params'.format(args.model_prefix)) + net.hybridize(static_alloc=True, static_shape=True) + # calibration config only_calibration = args.only_calibration num_calib_batches = args.num_calib_batches @@ -500,7 +507,7 @@ def calibration(net, num_calib_batches, quantized_dtype, calib_mode): max_seq_length=max_seq_length, doc_stride=doc_stride, max_query_length=max_query_length, - is_pad=False, + is_pad=pad, is_training=False)._transform, lazy=False) dev_data_transform, _ = preprocess_dataset( @@ -509,7 +516,7 @@ def calibration(net, num_calib_batches, quantized_dtype, calib_mode): max_seq_length=max_seq_length, doc_stride=doc_stride, max_query_length=max_query_length, - is_pad=False, + is_pad=pad, is_training=False), for_calibration=only_calibration) log.info('The number of examples after preprocessing:{}'.format( @@ -559,7 +566,7 @@ def evaluate(): max_seq_length=max_seq_length, doc_stride=doc_stride, max_query_length=max_query_length, - is_pad=False, + is_pad=pad, is_training=False)._transform, lazy=False) dev_data_transform, _ = preprocess_dataset( @@ -568,7 +575,7 @@ def evaluate(): max_seq_length=max_seq_length, doc_stride=doc_stride, max_query_length=max_query_length, - is_pad=False, + is_pad=pad, is_training=False)) log.info('The number of examples after preprocessing:{}'.format( len(dev_data_transform))) diff --git a/scripts/bert/quantization.md b/scripts/bert/quantization.md new file mode 100644 index 0000000000..63aee7c8a3 --- /dev/null +++ b/scripts/bert/quantization.md @@ -0,0 +1,112 @@ +# Bert Quantization for MRPC and SQuAD + +Tested on EC2 c5.12xlarge. + +1. Install MXNet and GluonNLP + +```bash +pip install mxnet-mkl --pre [--user] +pip install gluonnlp --pre [--user] +``` + +2. Clone BERT scripts to local + +BERT scripts are provided in the GluonNLP repository. 
+ +```bash +git clone https://github.com/dmlc/gluon-nlp.git gluon-nlp +cd gluon-nlp +cd scripts/bert +``` + +## Sentence Classification (MRPC) + +1. Fine-tune the MRPC task + +```bash +python finetune_classifier.py --task_name MRPC --batch_size 32 --optimizer bertadam --epochs 3 --lr 2e-5 +``` + +2. Run calibration + +Use 1 core on 1 socket for calibration. + +```bash +export KMP_AFFINITY=granularity=fine,noduplicates,compact,1,0 +export OMP_NUM_THREADS=1 +numactl --physcpubind=0 --membind=0 python finetune_classifier.py --task_name MRPC --epochs 1 --only_calibration --model_parameters ./output_dir/model_bert_MRPC_2.params --pad +``` + +`model_bert_MRPC_quantized_customize-symbol.json` and `model_bert_MRPC_quantized_customize-0000.params` will be saved in `output_dir`. + +3. Run inference for latency + +Use 4 cores on 1 socket for latency measurement. + +```bash +export KMP_AFFINITY=granularity=fine,noduplicates,compact,1,0 +export OMP_NUM_THREADS=4 +# float32 +numactl --physcpubind=0-3 --membind=0 python finetune_classifier.py --task_name MRPC --epochs 1 --only_inference --model_parameters ./output_dir/model_bert_MRPC_2.params --dev_batch_size 1 --pad +# int8 +numactl --physcpubind=0-3 --membind=0 python finetune_classifier.py --task_name MRPC --epochs 1 --only_inference --deploy --model_prefix ./output_dir/model_bert_MRPC_quantized_customize --dev_batch_size 1 --pad +``` + +4. Run inference for throughput + +Use full cores on 1 socket for throughput measurement. Change `--dev_batch_size` to any batch size you want. The dev dataset of MRPC only has 408 sentence pairs. + +```bash +export KMP_AFFINITY=granularity=fine,noduplicates,compact,1,0 +export OMP_NUM_THREADS=24 +# float32 +numactl --physcpubind=0-23 --membind=0 python finetune_classifier.py --task_name MRPC --epochs 1 --only_inference --model_parameters ./output_dir/model_bert_MRPC_2.params --dev_batch_size 32 --pad +# int8 +numactl --physcpubind=0-23 --membind=0 python finetune_classifier.py --task_name MRPC --epochs 1 --only_inference --deploy --model_prefix ./output_dir/model_bert_MRPC_quantized_customize --dev_batch_size 32 --pad +``` + +## Question answering + +1. Fine-tune the SQuAD task + +```bash +python finetune_squad.py --optimizer adam --batch_size 12 --lr 3e-5 --epochs 2 +``` + +2. Run calibration + +Use 1 core on 1 socket for calibration. + +```bash +export KMP_AFFINITY=granularity=fine,noduplicates,compact,1,0 +export OMP_NUM_THREADS=1 +numactl --physcpubind=0 --membind=0 python finetune_squad.py --only_calibration --model_parameters output_dir/net.params --pad +``` + +`model_bert_squad_quantized_customize-symbol.json` and `model_bert_squad_quantized_customize-0000.params` will be saved in `output_dir`. + +3. Run inference for latency + +Use 4 cores on 1 socket for latency measurement. + +```bash +export KMP_AFFINITY=granularity=fine,noduplicates,compact,1,0 +export OMP_NUM_THREADS=4 +# float32 +numactl --physcpubind=0-3 --membind=0 python finetune_squad.py --only_predict --model_parameters output_dir/net.params --test_batch_size 1 --pad +# int8 +numactl --physcpubind=0-3 --membind=0 python finetune_squad.py --only_predict --deploy --model_prefix output_dir/model_bert_squad_quantized_customize --test_batch_size 1 --pad +``` + +4. Run inference for throughput + +Use full cores on 1 socket for throughput measurement. Change `--test_batch_size` to any batch size you want. 
+
+```bash
+export KMP_AFFINITY=granularity=fine,noduplicates,compact,1,0
+export OMP_NUM_THREADS=24
+# float32
+numactl --physcpubind=0-23 --membind=0 python finetune_squad.py --only_predict --model_parameters output_dir/net.params --test_batch_size 24 --pad
+# int8
+numactl --physcpubind=0-23 --membind=0 python finetune_squad.py --only_predict --deploy --model_prefix output_dir/model_bert_squad_quantized_customize --test_batch_size 24 --pad
+```
\ No newline at end of file

From f69f0875b680e96a4e78f7addfc6c4572bcfe6ac Mon Sep 17 00:00:00 2001
From: xinyu-intel
Date: Tue, 19 Nov 2019 13:27:18 +0800
Subject: [PATCH 05/29] rename layernorm

---
 src/gluonnlp/calibration/collector.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/gluonnlp/calibration/collector.py b/src/gluonnlp/calibration/collector.py
index 3e72cbb2d8..d4df907883 100644
--- a/src/gluonnlp/calibration/collector.py
+++ b/src/gluonnlp/calibration/collector.py
@@ -61,7 +61,7 @@ def collect(self, name, arr):
         max_range = ndarray.max(arr).asscalar()
         if name.find('gelu0__mul0') != -1 and max_range > self.clip_max:
             max_range = self.clip_max
-        if name.find('bertlayernorm0_layernorm0') != -1 and min_range < self.clip_min:
+        if name.find('layernorm0_layernorm0') != -1 and min_range < self.clip_min:
             min_range = self.clip_min
         if name in self.min_max_dict:
             cur_min_max = self.min_max_dict[name]
@@ -71,4 +71,4 @@ def collect(self, name, arr):
         self.min_max_dict[name] = (min_range, max_range)
         if self.logger is not None:
             self.logger.info("Collecting layer %s min_range=%f, max_range=%f"
-                             % (name, min_range, max_range)) \ No newline at end of file
+                             % (name, min_range, max_range))
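The five patches above add everything the int8 path needs: the (input_ids, segment_ids, valid_length, label) batch order, the clipping collector, and calibration() entry points that hand a dev DataLoader to MXNet's quantization API. Condensed into one standalone sketch (a minimal sketch, assuming MXNet 1.6's `mxnet.contrib.quantization.quantize_net` on the MKL-DNN CPU backend; the checkpoint path and the synthetic calibration batches are placeholders for a fine-tuned model and real dev data):

```python
# A minimal sketch, not the scripts' exact flow: quantize a fine-tuned BERT
# classifier with the BertLayerCollector introduced in this series.
import mxnet as mx
from mxnet.contrib.quantization import quantize_net
from mxnet.gluon.data import ArrayDataset, DataLoader
import gluonnlp as nlp
from gluonnlp.model import BERTClassifier
from gluonnlp.calibration import BertLayerCollector

ctx = mx.cpu()  # calibration currently targets the MKL-DNN CPU backend only
bert, vocab = nlp.model.get_model('bert_12_768_12',
                                  dataset_name='book_corpus_wiki_en_uncased',
                                  pretrained=False, use_decoder=False,
                                  use_classifier=False, ctx=ctx)
model = BERTClassifier(bert, num_classes=2, dropout=0.1)
# Hypothetical checkpoint path; in practice this is the output of
# finetune_classifier.py.
model.load_parameters('output_dir/model_bert_MRPC_2.params', ctx=ctx)
model.hybridize(static_alloc=True)

# Synthetic stand-in for the dev DataLoader. Batches must follow the order the
# patches establish: the network inputs (input_ids, segment_ids, valid_length)
# first, one label last.
num_samples, seq_len = 160, 128
calib_data = DataLoader(
    ArrayDataset(mx.nd.ones((num_samples, seq_len)),    # input_ids
                 mx.nd.zeros((num_samples, seq_len)),   # segment_ids
                 mx.nd.full((num_samples,), seq_len),   # valid_length
                 mx.nd.zeros((num_samples, 1))),        # label
    batch_size=32)

# Clip GELU outputs above 10 and layer-norm outputs below -50; the collected
# min/max ranges become the int8 quantization thresholds.
collector = BertLayerCollector(clip_min=-50, clip_max=10)
model = quantize_net(model, quantized_dtype='auto',
                     exclude_layers=[],
                     exclude_layers_match=['elemwise_add'],
                     calib_data=calib_data,
                     calib_mode='customize',
                     num_calib_examples=32 * 5,
                     ctx=ctx,
                     LayerOutputCollector=collector)
model.export('output_dir/model_bert_MRPC_quantized_customize', epoch=0)
```

The exported symbol/params pair is what the scripts' `--deploy` flag later reloads through `mx.gluon.SymbolBlock.imports` for int8 inference.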
+ +```bash +export KMP_AFFINITY=granularity=fine,noduplicates,compact,1,0 +export OMP_NUM_THREADS=24 +# float32 +numactl --physcpubind=0-23 --membind=0 python finetune_squad.py --only_predict --model_parameters output_dir/net.params --test_batch_size 24 --pad +# int8 +numactl --physcpubind=0-23 --membind=0 python finetune_squad.py --only_predict --deploy --model_prefix output_dir/model_bert_squad_quantized_customize --test_batch_size 24 --pad +``` \ No newline at end of file From 9b4505da00c7344d27f6847593bea6e8c6a426fa Mon Sep 17 00:00:00 2001 From: xinyu-intel Date: Tue, 19 Nov 2019 13:27:18 +0800 Subject: [PATCH 10/29] rename layernorm --- src/gluonnlp/calibration/collector.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/gluonnlp/calibration/collector.py b/src/gluonnlp/calibration/collector.py index 3e72cbb2d8..d4df907883 100644 --- a/src/gluonnlp/calibration/collector.py +++ b/src/gluonnlp/calibration/collector.py @@ -61,7 +61,7 @@ def collect(self, name, arr): max_range = ndarray.max(arr).asscalar() if name.find('gelu0__mul0') != -1 and max_range > self.clip_max: max_range = self.clip_max - if name.find('bertlayernorm0_layernorm0') != -1 and min_range < self.clip_min: + if name.find('layernorm0_layernorm0') != -1 and min_range < self.clip_min: min_range = self.clip_min if name in self.min_max_dict: cur_min_max = self.min_max_dict[name] @@ -71,4 +71,4 @@ def collect(self, name, arr): self.min_max_dict[name] = (min_range, max_range) if self.logger is not None: self.logger.info("Collecting layer %s min_range=%f, max_range=%f" - % (name, min_range, max_range)) \ No newline at end of file + % (name, min_range, max_range)) From 08859e356078065dfb72a76e1d500e0077da62f9 Mon Sep 17 00:00:00 2001 From: xinyu-intel Date: Tue, 24 Dec 2019 12:39:17 +0800 Subject: [PATCH 11/29] fix lint --- src/gluonnlp/calibration/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/gluonnlp/calibration/__init__.py b/src/gluonnlp/calibration/__init__.py index 0d140419f5..5959ce84fd 100644 --- a/src/gluonnlp/calibration/__init__.py +++ b/src/gluonnlp/calibration/__init__.py @@ -18,4 +18,5 @@ # pylint: disable=wildcard-import """This module includes calibration utilities such as layer collector.""" from . import collector -from .collector import * \ No newline at end of file +from .collector import * + From 8f9f531cfc19dfa03af3c0349500271d428fb404 Mon Sep 17 00:00:00 2001 From: xinyu-intel Date: Tue, 24 Dec 2019 13:31:05 +0800 Subject: [PATCH 12/29] fix lint --- src/gluonnlp/calibration/__init__.py | 1 - src/gluonnlp/calibration/collector.py | 7 +++---- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/src/gluonnlp/calibration/__init__.py b/src/gluonnlp/calibration/__init__.py index 5959ce84fd..a4b677b712 100644 --- a/src/gluonnlp/calibration/__init__.py +++ b/src/gluonnlp/calibration/__init__.py @@ -19,4 +19,3 @@ """This module includes calibration utilities such as layer collector.""" from . 
import collector from .collector import * - diff --git a/src/gluonnlp/calibration/collector.py b/src/gluonnlp/calibration/collector.py index d4df907883..4cfd15eb02 100644 --- a/src/gluonnlp/calibration/collector.py +++ b/src/gluonnlp/calibration/collector.py @@ -19,7 +19,7 @@ Bert layer output collector with threshold clipping for calibration =================================================================== -This collector is designed for collect and clip the layer outputs of bert +This collector is designed for collect and clip the layer outputs of bert while calibration with Gluon NLP Toolkit. @article{devlin2018bert, @@ -31,12 +31,11 @@ } """ import ctypes -import numpy as np from mxnet import ndarray from mxnet.base import NDArrayHandle, py_str from mxnet.ndarray import NDArray -class BertLayerCollector(object): +class BertLayerCollector: """Saves layer output min and max values in a dict with layer names as keys. The collected min and max values will be directly used as thresholds for quantization. """ @@ -70,5 +69,5 @@ def collect(self, name, arr): else: self.min_max_dict[name] = (min_range, max_range) if self.logger is not None: - self.logger.info("Collecting layer %s min_range=%f, max_range=%f" + self.logger.info('Collecting layer %s min_range=%f, max_range=%f' % (name, min_range, max_range)) From 4ccc979a58dfcf838baa220ff9178c18d212613f Mon Sep 17 00:00:00 2001 From: xinyu-intel Date: Wed, 25 Dec 2019 14:24:43 +0800 Subject: [PATCH 13/29] fix ut and lint --- docs/examples/sentence_embedding/bert.md | 6 ++-- scripts/bert/finetune_classifier.py | 14 +++++---- scripts/bert/finetune_squad.py | 30 +++++++++++--------- scripts/tests/test_bert_dataset_transform.py | 4 +-- 4 files changed, 29 insertions(+), 25 deletions(-) diff --git a/docs/examples/sentence_embedding/bert.md b/docs/examples/sentence_embedding/bert.md index cbd0cdf98d..ef7071f6fe 100644 --- a/docs/examples/sentence_embedding/bert.md +++ b/docs/examples/sentence_embedding/bert.md @@ -223,8 +223,8 @@ print('%s token id = %s'%(vocabulary.padding_token, vocabulary[vocabulary.paddin print('%s token id = %s'%(vocabulary.cls_token, vocabulary[vocabulary.cls_token])) print('%s token id = %s'%(vocabulary.sep_token, vocabulary[vocabulary.sep_token])) print('token ids = \n%s'%data_train[sample_id][0]) -print('valid length = \n%s'%data_train[sample_id][1]) -print('segment ids = \n%s'%data_train[sample_id][2]) +print('segment ids = \n%s'%data_train[sample_id][1]) +print('valid length = \n%s'%data_train[sample_id][2]) print('label = \n%s'%data_train[sample_id][3]) ``` @@ -261,7 +261,7 @@ num_epochs = 3 for epoch_id in range(num_epochs): metric.reset() step_loss = 0 - for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(bert_dataloader): + for batch_id, (token_ids, segment_ids, valid_length, label) in enumerate(bert_dataloader): with mx.autograd.record(): # Load the data to the GPU diff --git a/scripts/bert/finetune_classifier.py b/scripts/bert/finetune_classifier.py index 0b9e75b70f..17690ff74a 100644 --- a/scripts/bert/finetune_classifier.py +++ b/scripts/bert/finetune_classifier.py @@ -43,7 +43,7 @@ import mxnet as mx from mxnet import gluon from mxnet.contrib.amp import amp -from mxnet.contrib.quantization import * +from mxnet.contrib.quantization import quantize_net import gluonnlp as nlp from gluonnlp.data import BERTTokenizer from gluonnlp.model import BERTClassifier, RoBERTaClassifier @@ -188,12 +188,13 @@ help='quantize model') parser.add_argument('--num_calib_batches', type=int, default=5, help='number of 
batches for calibration')
-parser.add_argument('--quantized_dtype', type=str, default='auto',
+parser.add_argument('--quantized_dtype', type=str, default='auto',
                     choices=['auto', 'int8', 'uint8'],
                     help='quantization destination data type for input data')
 parser.add_argument('--calib_mode', type=str, default='customize',
                     choices=['none', 'naive', 'entropy', 'customize'],
-                    help='calibration mode used for generating calibration table for the quantized symbol.')
+                    help='calibration mode used for generating calibration table '
+                         'for the quantized symbol.')
 
 args = parser.parse_args()
 
@@ -296,7 +297,8 @@
 if deploy:
     logging.info('load symbol file directly as SymbolBlock for model deployment')
     model = mx.gluon.SymbolBlock.imports('{}-symbol.json'.format(args.model_prefix),
-                                         ['data0', 'data1', 'data2'], '{}-0000.params'.format(args.model_prefix))
+                                         ['data0', 'data1', 'data2'],
+                                         '{}-0000.params'.format(args.model_prefix))
     model.hybridize(static_alloc=True, static_shape=True)
 
 # data processing
@@ -399,9 +401,9 @@ def preprocess_data(tokenizer, task, batch_size, dev_batch_size, max_len, vocab,
 def calibration(net, dev_data_list, num_calib_batches, quantized_dtype, calib_mode):
     """calibration function on the dev dataset."""
     assert len(dev_data_list) == 1, \
-        "Currectly, MNLI not supported."
+        'Currently, MNLI is not supported.'
     assert ctx == mx.cpu(), \
-        "Currently only supports CPU with MKL-DNN backend."
+        'Currently only supports CPU with MKL-DNN backend.'
     logging.info('Now we are doing calibration on dev with %s.', ctx)
     for _, dev_data in dev_data_list:
         collector = BertLayerCollector(clip_min=-50, clip_max=10, logger=logging)
diff --git a/scripts/bert/finetune_squad.py b/scripts/bert/finetune_squad.py
index d2bf6c19c5..c98fcc1f37 100644
--- a/scripts/bert/finetune_squad.py
+++ b/scripts/bert/finetune_squad.py
@@ -44,7 +44,7 @@
 
 import numpy as np
 import mxnet as mx
-from mxnet.contrib.quantization import *
+from mxnet.contrib.quantization import quantize_net
 import gluonnlp as nlp
 from gluonnlp.data import SQuAD
 
@@ -216,13 +216,14 @@
 parser.add_argument('--num_calib_batches', type=int, default=10,
                     help='number of batches for calibration')
-parser.add_argument('--quantized_dtype', type=str, default='auto',
+parser.add_argument('--quantized_dtype', type=str, default='auto',
                     choices=['auto', 'int8', 'uint8'],
                     help='quantization destination data type for input data')
 parser.add_argument('--calib_mode', type=str, default='customize',
                     choices=['none', 'naive', 'entropy', 'customize'],
-                    help='calibration mode used for generating calibration table for the quantized symbol.')
+                    help='calibration mode used for generating calibration table '
+                         'for the quantized symbol.')
 
 args = parser.parse_args()
 
@@ -345,7 +346,8 @@
 if deploy:
     logging.info('load symbol file directly as SymbolBlock for model deployment')
     net = mx.gluon.SymbolBlock.imports('{}-symbol.json'.format(args.model_prefix),
-                                       ['data0', 'data1', 'data2'], '{}-0000.params'.format(args.model_prefix))
+                                       ['data0', 'data1', 'data2'],
+                                       '{}-0000.params'.format(args.model_prefix))
     net.hybridize(static_alloc=True, static_shape=True)
 
 # calibration config
@@ -511,7 +513,7 @@ def calibration(net, num_calib_batches, quantized_dtype, calib_mode):
                 max_query_length=max_query_length,
                 is_pad=pad,
                 is_training=False),
-                for_calibration=only_calibration)
+        for_calibration=only_calibration)
 
     log.info('The number of examples after preprocessing:{}'.format(
         len(dev_data_transform)))
@@ -522,18 +524,18 @@ def calibration(net, num_calib_batches, quantized_dtype, calib_mode):
shuffle=False, last_batch='keep') assert ctx == mx.cpu(), \ - "Currently only supports CPU with MKL-DNN backend." + 'Currently only supports CPU with MKL-DNN backend.' log.info('Now we are doing calibration on dev with %s.', ctx) collector = BertLayerCollector(clip_min=-50, clip_max=10, logger=log) net = quantize_net(net, quantized_dtype=quantized_dtype, - exclude_layers=[], - exclude_layers_match=['elemwise_add'], - calib_data=dev_dataloader, - calib_mode=calib_mode, - num_calib_examples=test_batch_size * num_calib_batches, - ctx=ctx, - LayerOutputCollector=collector, - logger=log) + exclude_layers=[], + exclude_layers_match=['elemwise_add'], + calib_data=dev_dataloader, + calib_mode=calib_mode, + num_calib_examples=test_batch_size * num_calib_batches, + ctx=ctx, + LayerOutputCollector=collector, + logger=log) # save params ckpt_name = 'model_bert_squad_quantized_{0}'.format(calib_mode) params_saved = os.path.join(output_dir, ckpt_name) diff --git a/scripts/tests/test_bert_dataset_transform.py b/scripts/tests/test_bert_dataset_transform.py index 57bcc2ec1e..3a9080154c 100644 --- a/scripts/tests/test_bert_dataset_transform.py +++ b/scripts/tests/test_bert_dataset_transform.py @@ -37,7 +37,7 @@ def test_bert_dataset_transform(): bert_cls_dataset_t = BERTDatasetTransform(tokenizer, 15, class_labels=[label_cls], pad=True, pair=True) - token_ids, length, type_ids, label_ids = bert_cls_dataset_t((text_a, text_b, label_cls)) + token_ids, type_ids, length, label_ids = bert_cls_dataset_t((text_a, text_b, label_cls)) text_a_tokens = ['is', 'this', 'jack', '##son', '##ville', '?'] text_b_tokens = ['no', 'it', 'is', 'not'] @@ -65,7 +65,7 @@ def test_bert_dataset_transform(): # test BERTDatasetTransform for regression task label_reg = 0.2 bert_reg_dataset_t = BERTDatasetTransform(tokenizer, 15, pad=True, pair=True) - token_ids, length, type_ids, label_reg_val = bert_reg_dataset_t((text_a, text_b, label_reg)) + token_ids, type_ids, length, label_reg_val = bert_reg_dataset_t((text_a, text_b, label_reg)) assert all(token_ids == valid_token_ids) assert length == len(vocab_tokens) + 3 assert all(type_ids == valid_type_ids) From 412ff7333e28b3dc784f8793c4e72a3c4b71137a Mon Sep 17 00:00:00 2001 From: xinyu-intel Date: Wed, 25 Dec 2019 15:22:34 +0800 Subject: [PATCH 14/29] fix ut --- docs/examples/sentence_embedding/bert.md | 2 +- scripts/bert/finetune_classifier.py | 2 +- scripts/bert/finetune_squad.py | 11 +---------- 3 files changed, 3 insertions(+), 12 deletions(-) diff --git a/docs/examples/sentence_embedding/bert.md b/docs/examples/sentence_embedding/bert.md index ef7071f6fe..675f9300f0 100644 --- a/docs/examples/sentence_embedding/bert.md +++ b/docs/examples/sentence_embedding/bert.md @@ -241,7 +241,7 @@ batch_size = 32 lr = 5e-6 # The FixedBucketSampler and the DataLoader for making the mini-batches -train_sampler = nlp.data.FixedBucketSampler(lengths=[int(item[1]) for item in data_train], +train_sampler = nlp.data.FixedBucketSampler(lengths=[int(item[2]) for item in data_train], batch_size=batch_size, shuffle=True) bert_dataloader = mx.gluon.data.DataLoader(data_train, batch_sampler=train_sampler) diff --git a/scripts/bert/finetune_classifier.py b/scripts/bert/finetune_classifier.py index 17690ff74a..885d195145 100644 --- a/scripts/bert/finetune_classifier.py +++ b/scripts/bert/finetune_classifier.py @@ -420,7 +420,7 @@ def calibration(net, dev_data_list, num_calib_batches, quantized_dtype, calib_mo ckpt_name = 'model_bert_{0}_quantized_{1}'.format(task_name, calib_mode) params_saved = 
os.path.join(output_dir, ckpt_name) net.export(params_saved, epoch=0) - logging.info('Saving quantized model at %s' % output_dir) + logging.info('Saving quantized model at %s', output_dir) def test(loader_test, segment): diff --git a/scripts/bert/finetune_squad.py b/scripts/bert/finetune_squad.py index c98fcc1f37..b4c6854c86 100644 --- a/scripts/bert/finetune_squad.py +++ b/scripts/bert/finetune_squad.py @@ -496,15 +496,6 @@ def calibration(net, num_calib_batches, quantized_dtype, calib_mode): nlp.data.batchify.Stack('float32'), nlp.data.batchify.Stack('float32')) - dev_dataset = dev_data.transform( - SQuADTransform( - copy.copy(tokenizer), - max_seq_length=max_seq_length, - doc_stride=doc_stride, - max_query_length=max_query_length, - is_pad=pad, - is_training=False)._transform, lazy=False) - dev_data_transform, _ = preprocess_dataset( dev_data, SQuADTransform( copy.copy(tokenizer), @@ -540,7 +531,7 @@ def calibration(net, num_calib_batches, quantized_dtype, calib_mode): ckpt_name = 'model_bert_squad_quantized_{0}'.format(calib_mode) params_saved = os.path.join(output_dir, ckpt_name) net.export(params_saved, epoch=0) - log.info('Saving quantized model at %s' % output_dir) + log.info('Saving quantized model at %s', output_dir) def evaluate(): """Evaluate the model on validation dataset. From 53ea62e6ee0239038325ac01ef4425383630eb63 Mon Sep 17 00:00:00 2001 From: xinyu-intel Date: Mon, 30 Dec 2019 09:53:01 +0800 Subject: [PATCH 15/29] update python env --- env/cpu/py3-master.yml | 2 +- env/docker/py3.yml | 2 +- env/gpu/py3-master.yml | 2 +- scripts/bert/finetune_classifier.py | 20 ++++++++++---------- scripts/bert/finetune_squad.py | 20 ++++++++++---------- 5 files changed, 23 insertions(+), 23 deletions(-) diff --git a/env/cpu/py3-master.yml b/env/cpu/py3-master.yml index 6db16d6584..cc96d73811 100644 --- a/env/cpu/py3-master.yml +++ b/env/cpu/py3-master.yml @@ -18,7 +18,7 @@ dependencies: - flake8==3.7.9 - mock<3 - sphinx==2.2.1 - - https://apache-mxnet.s3-us-west-2.amazonaws.com/dist/2019-12-16/dist/mxnet-1.6.0b20191216-py2.py3-none-manylinux1_x86_64.whl + - https://apache-mxnet.s3-us-west-2.amazonaws.com/dist/2019-12-29/dist/mxnet-1.6.0b20191229-py2.py3-none-manylinux1_x86_64.whl - scipy==1.3.2 - regex==2019.11.1 - nltk==3.4.5 diff --git a/env/docker/py3.yml b/env/docker/py3.yml index f3a9fa8e7b..7484ef5106 100644 --- a/env/docker/py3.yml +++ b/env/docker/py3.yml @@ -31,7 +31,7 @@ dependencies: - flaky==3.6.1 - flake8==3.7.9 - mock<3 - - https://apache-mxnet.s3-us-west-2.amazonaws.com/dist/2019-12-16/dist/mxnet_cu100-1.6.0b20191216-py2.py3-none-manylinux1_x86_64.whl + - https://apache-mxnet.s3-us-west-2.amazonaws.com/dist/2019-12-29/dist/mxnet_cu100-1.6.0b20191229-py2.py3-none-manylinux1_x86_64.whl - scipy==1.3.2 - regex==2019.11.1 - nltk==3.4.5 diff --git a/env/gpu/py3-master.yml b/env/gpu/py3-master.yml index c007a335ef..3c86938390 100644 --- a/env/gpu/py3-master.yml +++ b/env/gpu/py3-master.yml @@ -31,7 +31,7 @@ dependencies: - flaky==3.6.1 - flake8==3.7.9 - mock<3 - - https://apache-mxnet.s3-us-west-2.amazonaws.com/dist/2019-12-16/dist/mxnet_cu100-1.6.0b20191216-py2.py3-none-manylinux1_x86_64.whl + - https://apache-mxnet.s3-us-west-2.amazonaws.com/dist/2019-12-29/dist/mxnet_cu100-1.6.0b20191229-py2.py3-none-manylinux1_x86_64.whl - scipy==1.3.2 - regex==2019.11.1 - nltk==3.4.5 diff --git a/scripts/bert/finetune_classifier.py b/scripts/bert/finetune_classifier.py index 885d195145..e797a0dae2 100644 --- a/scripts/bert/finetune_classifier.py +++ b/scripts/bert/finetune_classifier.py @@ 
-43,7 +43,7 @@ import mxnet as mx from mxnet import gluon from mxnet.contrib.amp import amp -from mxnet.contrib.quantization import quantize_net +from mxnet.contrib.quantization import quantize_net_v2 import gluonnlp as nlp from gluonnlp.data import BERTTokenizer from gluonnlp.model import BERTClassifier, RoBERTaClassifier @@ -407,15 +407,15 @@ def calibration(net, dev_data_list, num_calib_batches, quantized_dtype, calib_mo logging.info('Now we are doing calibration on dev with %s.', ctx) for _, dev_data in dev_data_list: collector = BertLayerCollector(clip_min=-50, clip_max=10, logger=logging) - net = quantize_net(net, quantized_dtype=quantized_dtype, - exclude_layers=[], - exclude_layers_match=['elemwise_add'], - calib_data=dev_data, - calib_mode=calib_mode, - num_calib_examples=dev_batch_size * num_calib_batches, - ctx=ctx, - LayerOutputCollector=collector, - logger=logging) + net = quantize_net_v2(net, quantized_dtype=quantized_dtype, + exclude_layers=[], + exclude_layers_match=['elemwise_add'], + calib_data=dev_data, + calib_mode=calib_mode, + num_calib_examples=dev_batch_size * num_calib_batches, + ctx=ctx, + LayerOutputCollector=collector, + logger=logging) # save params ckpt_name = 'model_bert_{0}_quantized_{1}'.format(task_name, calib_mode) params_saved = os.path.join(output_dir, ckpt_name) diff --git a/scripts/bert/finetune_squad.py b/scripts/bert/finetune_squad.py index b4c6854c86..388cac5c7b 100644 --- a/scripts/bert/finetune_squad.py +++ b/scripts/bert/finetune_squad.py @@ -44,7 +44,7 @@ import numpy as np import mxnet as mx -from mxnet.contrib.quantization import quantize_net +from mxnet.contrib.quantization import quantize_net_v2 import gluonnlp as nlp from gluonnlp.data import SQuAD @@ -518,15 +518,15 @@ def calibration(net, num_calib_batches, quantized_dtype, calib_mode): 'Currently only supports CPU with MKL-DNN backend.' log.info('Now we are doing calibration on dev with %s.', ctx) collector = BertLayerCollector(clip_min=-50, clip_max=10, logger=log) - net = quantize_net(net, quantized_dtype=quantized_dtype, - exclude_layers=[], - exclude_layers_match=['elemwise_add'], - calib_data=dev_dataloader, - calib_mode=calib_mode, - num_calib_examples=test_batch_size * num_calib_batches, - ctx=ctx, - LayerOutputCollector=collector, - logger=log) + net = quantize_net_v2(net, quantized_dtype=quantized_dtype, + exclude_layers=[], + exclude_layers_match=['elemwise_add'], + calib_data=dev_dataloader, + calib_mode=calib_mode, + num_calib_examples=test_batch_size * num_calib_batches, + ctx=ctx, + LayerOutputCollector=collector, + logger=log) # save params ckpt_name = 'model_bert_squad_quantized_{0}'.format(calib_mode) params_saved = os.path.join(output_dir, ckpt_name) From 720098fdd2686e45ff88de0f668339483b3596ad Mon Sep 17 00:00:00 2001 From: xinyu-intel Date: Thu, 16 Jan 2020 14:38:14 +0800 Subject: [PATCH 16/29] add quantization doc to website --- docs/examples/index.rst | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/docs/examples/index.rst b/docs/examples/index.rst index af5a9685c4..c855a5e7d3 100644 --- a/docs/examples/index.rst +++ b/docs/examples/index.rst @@ -104,6 +104,13 @@ Sentence Embedding See how to use GluonNLP to fine-tune a sentence pair classification model with pre-trained BERT parameters. + .. 
card:: + :title: INT8 Quantization for BERT Sentence Classification and Question Answering + :link: sentence_embedding/bert/quantization.html + + See how to use GluonNLP to quantize a sentence pair classification or question answering + model into int8 data type with Intel DLBoost. + .. toctree:: :hidden: :maxdepth: 1 @@ -111,6 +118,7 @@ Sentence Embedding sentence_embedding/elmo_sentence_representation.ipynb sentence_embedding/self_attentive_sentence_embedding.ipynb sentence_embedding/bert.ipynb + sentence_embedding/bert/quantization.md Sentiment Analysis From a3a8913ec33c345a476320ccf1898cc206bf20b5 Mon Sep 17 00:00:00 2001 From: xinyu-intel Date: Thu, 16 Jan 2020 15:32:28 +0800 Subject: [PATCH 17/29] move doc --- docs/examples/index.rst | 4 ++-- .../bert => docs/examples/sentence_embedding}/quantization.md | 0 2 files changed, 2 insertions(+), 2 deletions(-) rename {scripts/bert => docs/examples/sentence_embedding}/quantization.md (100%) diff --git a/docs/examples/index.rst b/docs/examples/index.rst index c855a5e7d3..45b8de01b0 100644 --- a/docs/examples/index.rst +++ b/docs/examples/index.rst @@ -106,7 +106,7 @@ Sentence Embedding .. card:: :title: INT8 Quantization for BERT Sentence Classification and Question Answering - :link: sentence_embedding/bert/quantization.html + :link: sentence_embedding/quantization.html See how to use GluonNLP to quantize a sentence pair classification or question answering model into int8 data type with Intel DLBoost. @@ -118,7 +118,7 @@ Sentence Embedding sentence_embedding/elmo_sentence_representation.ipynb sentence_embedding/self_attentive_sentence_embedding.ipynb sentence_embedding/bert.ipynb - sentence_embedding/bert/quantization.md + sentence_embedding/quantization.md Sentiment Analysis diff --git a/scripts/bert/quantization.md b/docs/examples/sentence_embedding/quantization.md similarity index 100% rename from scripts/bert/quantization.md rename to docs/examples/sentence_embedding/quantization.md From af44a8ed433ed40d42eda615c9dd3d661408c9c0 Mon Sep 17 00:00:00 2001 From: xinyu-intel Date: Thu, 16 Jan 2020 16:36:40 +0800 Subject: [PATCH 18/29] pin to 1231 gpu and remove doc temp --- docs/examples/index.rst | 8 -- .../sentence_embedding/quantization.md | 112 ------------------ env/cpu/py3-master.yml | 2 +- env/gpu/py3-master.yml | 2 +- 4 files changed, 2 insertions(+), 122 deletions(-) delete mode 100644 docs/examples/sentence_embedding/quantization.md diff --git a/docs/examples/index.rst b/docs/examples/index.rst index 45b8de01b0..af5a9685c4 100644 --- a/docs/examples/index.rst +++ b/docs/examples/index.rst @@ -104,13 +104,6 @@ Sentence Embedding See how to use GluonNLP to fine-tune a sentence pair classification model with pre-trained BERT parameters. - .. card:: - :title: INT8 Quantization for BERT Sentence Classification and Question Answering - :link: sentence_embedding/quantization.html - - See how to use GluonNLP to quantize a sentence pair classification or question answering - model into int8 data type with Intel DLBoost. - .. 
toctree:: :hidden: :maxdepth: 1 @@ -118,7 +111,6 @@ Sentence Embedding sentence_embedding/elmo_sentence_representation.ipynb sentence_embedding/self_attentive_sentence_embedding.ipynb sentence_embedding/bert.ipynb - sentence_embedding/quantization.md Sentiment Analysis diff --git a/docs/examples/sentence_embedding/quantization.md b/docs/examples/sentence_embedding/quantization.md deleted file mode 100644 index 63aee7c8a3..0000000000 --- a/docs/examples/sentence_embedding/quantization.md +++ /dev/null @@ -1,112 +0,0 @@ -# Bert Quantization for MRPC and SQuAD - -Tested on EC2 c5.12xlarge. - -1. Install MXNet and GluonNLP - -```bash -pip install mxnet-mkl --pre [--user] -pip install gluonnlp --pre [--user] -``` - -2. Clone BERT scripts to local - -BERT scripts are provided in the GluonNLP repository. - -```bash -git clone https://github.com/dmlc/gluon-nlp.git gluon-nlp -cd gluon-nlp -cd scripts/bert -``` - -## Sentence Classification (MRPC) - -1. Fine-tune the MRPC task - -```bash -python finetune_classifier.py --task_name MRPC --batch_size 32 --optimizer bertadam --epochs 3 --lr 2e-5 -``` - -2. Run calibration - -Use 1 core on 1 socket for calibration. - -```bash -export KMP_AFFINITY=granularity=fine,noduplicates,compact,1,0 -export OMP_NUM_THREADS=1 -numactl --physcpubind=0 --membind=0 python finetune_classifier.py --task_name MRPC --epochs 1 --only_calibration --model_parameters ./output_dir/model_bert_MRPC_2.params --pad -``` - -`model_bert_MRPC_quantized_customize-symbol.json` and `model_bert_MRPC_quantized_customize-0000.params` will be saved in `output_dir`. - -3. Run inference for latency - -Use 4 cores on 1 socket for latency measurement. - -```bash -export KMP_AFFINITY=granularity=fine,noduplicates,compact,1,0 -export OMP_NUM_THREADS=4 -# float32 -numactl --physcpubind=0-3 --membind=0 python finetune_classifier.py --task_name MRPC --epochs 1 --only_inference --model_parameters ./output_dir/model_bert_MRPC_2.params --dev_batch_size 1 --pad -# int8 -numactl --physcpubind=0-3 --membind=0 python finetune_classifier.py --task_name MRPC --epochs 1 --only_inference --deploy --model_prefix ./output_dir/model_bert_MRPC_quantized_customize --dev_batch_size 1 --pad -``` - -4. Run inference for throughput - -Use full cores on 1 socket for throughput measurement. Change `--dev_batch_size` to any batch size you want. The dev dataset of MRPC only has 408 sentence pairs. - -```bash -export KMP_AFFINITY=granularity=fine,noduplicates,compact,1,0 -export OMP_NUM_THREADS=24 -# float32 -numactl --physcpubind=0-23 --membind=0 python finetune_classifier.py --task_name MRPC --epochs 1 --only_inference --model_parameters ./output_dir/model_bert_MRPC_2.params --dev_batch_size 32 --pad -# int8 -numactl --physcpubind=0-23 --membind=0 python finetune_classifier.py --task_name MRPC --epochs 1 --only_inference --deploy --model_prefix ./output_dir/model_bert_MRPC_quantized_customize --dev_batch_size 32 --pad -``` - -## Question answering - -1. Fine-tune the SQuAD task - -```bash -python finetune_squad.py --optimizer adam --batch_size 12 --lr 3e-5 --epochs 2 -``` - -2. Run calibration - -Use 1 core on 1 socket for calibration. - -```bash -export KMP_AFFINITY=granularity=fine,noduplicates,compact,1,0 -export OMP_NUM_THREADS=1 -numactl --physcpubind=0 --membind=0 python finetune_squad.py --only_calibration --model_parameters output_dir/net.params --pad -``` - -`model_bert_squad_quantized_customize-symbol.json` and `model_bert_squad_quantized_customize-0000.params` will be saved in `output_dir`. - -3. 
Run inference for latency - -Use 4 cores on 1 socket for latency measurement. - -```bash -export KMP_AFFINITY=granularity=fine,noduplicates,compact,1,0 -export OMP_NUM_THREADS=4 -# float32 -numactl --physcpubind=0-3 --membind=0 python finetune_squad.py --only_predict --model_parameters output_dir/net.params --test_batch_size 1 --pad -# int8 -numactl --physcpubind=0-3 --membind=0 python finetune_squad.py --only_predict --deploy --model_prefix output_dir/model_bert_squad_quantized_customize --test_batch_size 1 --pad -``` - -4. Run inference for throughput - -Use full cores on 1 socket for throughput measurement. Change `--test_batch_size` to any batch size you want. - -```bash -export KMP_AFFINITY=granularity=fine,noduplicates,compact,1,0 -export OMP_NUM_THREADS=24 -# float32 -numactl --physcpubind=0-23 --membind=0 python finetune_squad.py --only_predict --model_parameters output_dir/net.params --test_batch_size 24 --pad -# int8 -numactl --physcpubind=0-23 --membind=0 python finetune_squad.py --only_predict --deploy --model_prefix output_dir/model_bert_squad_quantized_customize --test_batch_size 24 --pad -``` \ No newline at end of file diff --git a/env/cpu/py3-master.yml b/env/cpu/py3-master.yml index 0bf270d2bf..2f2a8c824d 100644 --- a/env/cpu/py3-master.yml +++ b/env/cpu/py3-master.yml @@ -34,7 +34,7 @@ dependencies: - flake8==3.7.9 - mock<3 - sphinx==2.2.1 - - https://apache-mxnet.s3-us-west-2.amazonaws.com/dist/2019-12-29/dist/mxnet-1.6.0b20191229-py2.py3-none-manylinux1_x86_64.whl + - https://lllausen-data.s3.amazonaws.com/mxnet_cu100-1.6.0b20191231-py2.py3-none-manylinux1_x86_64.whl - scipy==1.3.2 - regex==2019.11.1 - nltk==3.4.5 diff --git a/env/gpu/py3-master.yml b/env/gpu/py3-master.yml index 80c34ebfc2..f319fa7e91 100644 --- a/env/gpu/py3-master.yml +++ b/env/gpu/py3-master.yml @@ -33,7 +33,7 @@ dependencies: - flaky==3.6.1 - flake8==3.7.9 - mock<3 - - https://apache-mxnet.s3-us-west-2.amazonaws.com/dist/2019-12-29/dist/mxnet_cu100-1.6.0b20191229-py2.py3-none-manylinux1_x86_64.whl + - https://lllausen-data.s3.amazonaws.com/mxnet_cu100-1.6.0b20191231-py2.py3-none-manylinux1_x86_64.whl - scipy==1.3.2 - regex==2019.11.1 - nltk==3.4.5 From bd2ecc5b927cbebd68493e6119227603524bd45b Mon Sep 17 00:00:00 2001 From: xinyu-intel Date: Thu, 16 Jan 2020 17:57:20 +0800 Subject: [PATCH 19/29] pin to official 20200115 mxnet --- env/cpu/py3-master.yml | 2 +- env/cpu/py3.yml | 2 +- env/docker/py3.yml | 2 +- env/gpu/py3-master.yml | 2 +- env/gpu/py3.yml | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/env/cpu/py3-master.yml b/env/cpu/py3-master.yml index 2f2a8c824d..c6d9344c80 100644 --- a/env/cpu/py3-master.yml +++ b/env/cpu/py3-master.yml @@ -34,7 +34,7 @@ dependencies: - flake8==3.7.9 - mock<3 - sphinx==2.2.1 - - https://lllausen-data.s3.amazonaws.com/mxnet_cu100-1.6.0b20191231-py2.py3-none-manylinux1_x86_64.whl + - https://apache-mxnet.s3-us-west-2.amazonaws.com/dist/2020-01-15/dist/mxnet-1.6.0b20200115-py2.py3-none-manylinux1_x86_64.whl - scipy==1.3.2 - regex==2019.11.1 - nltk==3.4.5 diff --git a/env/cpu/py3.yml b/env/cpu/py3.yml index 87c40c63c2..af2604479f 100644 --- a/env/cpu/py3.yml +++ b/env/cpu/py3.yml @@ -32,7 +32,7 @@ dependencies: - flaky==3.6.1 - flake8==3.7.9 - mock<3 - - https://lllausen-data.s3.amazonaws.com/mxnet_cu100-1.6.0b20191231-py2.py3-none-manylinux1_x86_64.whl + - https://apache-mxnet.s3-us-west-2.amazonaws.com/dist/2020-01-15/dist/mxnet-1.6.0b20200115-py2.py3-none-manylinux1_x86_64.whl - scipy==1.3.2 - regex==2019.11.1 - nltk==3.4.5 diff --git 
a/env/docker/py3.yml b/env/docker/py3.yml index 12db5c5fe8..81fad5ae2c 100644 --- a/env/docker/py3.yml +++ b/env/docker/py3.yml @@ -32,7 +32,7 @@ dependencies: - flaky==3.6.1 - flake8==3.7.9 - mock<3 - - https://lllausen-data.s3.amazonaws.com/mxnet_cu100-1.6.0b20191231-py2.py3-none-manylinux1_x86_64.whl + - https://apache-mxnet.s3-us-west-2.amazonaws.com/dist/2020-01-15/dist/mxnet_cu100-1.6.0b20200115-py2.py3-none-manylinux1_x86_64.whl - scipy==1.3.2 - regex==2019.11.1 - nltk==3.4.5 diff --git a/env/gpu/py3-master.yml b/env/gpu/py3-master.yml index f319fa7e91..1657d3520f 100644 --- a/env/gpu/py3-master.yml +++ b/env/gpu/py3-master.yml @@ -33,7 +33,7 @@ dependencies: - flaky==3.6.1 - flake8==3.7.9 - mock<3 - - https://lllausen-data.s3.amazonaws.com/mxnet_cu100-1.6.0b20191231-py2.py3-none-manylinux1_x86_64.whl + - https://apache-mxnet.s3-us-west-2.amazonaws.com/dist/2020-01-15/dist/mxnet_cu100-1.6.0b20200115-py2.py3-none-manylinux1_x86_64.whl - scipy==1.3.2 - regex==2019.11.1 - nltk==3.4.5 diff --git a/env/gpu/py3.yml b/env/gpu/py3.yml index 87c40c63c2..0dce159019 100644 --- a/env/gpu/py3.yml +++ b/env/gpu/py3.yml @@ -32,7 +32,7 @@ dependencies: - flaky==3.6.1 - flake8==3.7.9 - mock<3 - - https://lllausen-data.s3.amazonaws.com/mxnet_cu100-1.6.0b20191231-py2.py3-none-manylinux1_x86_64.whl + - https://apache-mxnet.s3-us-west-2.amazonaws.com/dist/2020-01-15/dist/mxnet_cu100-1.6.0b20200115-py2.py3-none-manylinux1_x86_64.whl - scipy==1.3.2 - regex==2019.11.1 - nltk==3.4.5 From 11b9dfdde20140705cbfaa9d4ecfbcbed30234eb Mon Sep 17 00:00:00 2001 From: xinyu-intel Date: Sun, 19 Jan 2020 15:52:18 +0800 Subject: [PATCH 20/29] enable channel-wise quantization and smart mode --- scripts/bert/finetune_classifier.py | 3 ++- scripts/bert/finetune_squad.py | 3 ++- src/gluonnlp/calibration/collector.py | 2 +- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/scripts/bert/finetune_classifier.py b/scripts/bert/finetune_classifier.py index e797a0dae2..6dedf68f6f 100644 --- a/scripts/bert/finetune_classifier.py +++ b/scripts/bert/finetune_classifier.py @@ -409,7 +409,8 @@ def calibration(net, dev_data_list, num_calib_batches, quantized_dtype, calib_mo collector = BertLayerCollector(clip_min=-50, clip_max=10, logger=logging) net = quantize_net_v2(net, quantized_dtype=quantized_dtype, exclude_layers=[], - exclude_layers_match=['elemwise_add'], + quantize_mode='smart', + quantize_granularity='channel-wise', calib_data=dev_data, calib_mode=calib_mode, num_calib_examples=dev_batch_size * num_calib_batches, diff --git a/scripts/bert/finetune_squad.py b/scripts/bert/finetune_squad.py index 388cac5c7b..42c5c67dc6 100644 --- a/scripts/bert/finetune_squad.py +++ b/scripts/bert/finetune_squad.py @@ -520,7 +520,8 @@ def calibration(net, num_calib_batches, quantized_dtype, calib_mode): collector = BertLayerCollector(clip_min=-50, clip_max=10, logger=log) net = quantize_net_v2(net, quantized_dtype=quantized_dtype, exclude_layers=[], - exclude_layers_match=['elemwise_add'], + quantize_mode='smart', + quantize_granularity='channel-wise', calib_data=dev_dataloader, calib_mode=calib_mode, num_calib_examples=test_batch_size * num_calib_batches, diff --git a/src/gluonnlp/calibration/collector.py b/src/gluonnlp/calibration/collector.py index 4cfd15eb02..fcbd194482 100644 --- a/src/gluonnlp/calibration/collector.py +++ b/src/gluonnlp/calibration/collector.py @@ -58,7 +58,7 @@ def collect(self, name, arr): arr = NDArray(handle, writable=False) min_range = ndarray.min(arr).asscalar() max_range = ndarray.max(arr).asscalar() - 
if name.find('gelu0__mul0') != -1 and max_range > self.clip_max: + if name.find('gelu0_leakyrelu0') != -1 and max_range > self.clip_max: max_range = self.clip_max if name.find('layernorm0_layernorm0') != -1 and min_range < self.clip_min: min_range = self.clip_min From 69b01aa54d7a89341049ea3820ff3bc2d2ff0ab5 Mon Sep 17 00:00:00 2001 From: xinyu-intel Date: Thu, 23 Jan 2020 16:49:39 +0800 Subject: [PATCH 21/29] fix ut --- scripts/bert/finetune_classifier.py | 24 ++++++++++++------------ scripts/bert/finetune_squad.py | 24 ++++++++++++------------ 2 files changed, 24 insertions(+), 24 deletions(-) diff --git a/scripts/bert/finetune_classifier.py b/scripts/bert/finetune_classifier.py index ed2942369b..5c37ace5b7 100644 --- a/scripts/bert/finetune_classifier.py +++ b/scripts/bert/finetune_classifier.py @@ -435,16 +435,17 @@ def calibration(net, dev_data_list, num_calib_batches, quantized_dtype, calib_mo logging.info('Now we are doing calibration on dev with %s.', ctx) for _, dev_data in dev_data_list: collector = BertLayerCollector(clip_min=-50, clip_max=10, logger=logging) - net = quantize_net_v2(net, quantized_dtype=quantized_dtype, - exclude_layers=[], - quantize_mode='smart', - quantize_granularity='channel-wise', - calib_data=dev_data, - calib_mode=calib_mode, - num_calib_examples=dev_batch_size * num_calib_batches, - ctx=ctx, - LayerOutputCollector=collector, - logger=logging) + num_calib_examples=dev_batch_size * num_calib_batches, + net = mx.contrib.quantization.quantize_net_v2(net, quantized_dtype=quantized_dtype, + exclude_layers=[], + quantize_mode='smart', + quantize_granularity='channel-wise', + calib_data=dev_data, + calib_mode=calib_mode, + num_calib_examples=num_calib_examples, + ctx=ctx, + LayerOutputCollector=collector, + logger=logging) # save params ckpt_name = 'model_bert_{0}_quantized_{1}'.format(task_name, calib_mode) params_saved = os.path.join(output_dir, ckpt_name) @@ -699,13 +700,12 @@ def evaluate(loader_dev, metric, segment): if __name__ == '__main__': if only_calibration: try: - from mxnet.contrib.quantization import quantize_net_v2 calibration(model, dev_data_list, num_calib_batches, quantized_dtype, calib_mode) - except: + except AttributeError: nlp.utils.version.check_version('1.7.0', warning_only=True, library=mx) warnings.warn('INT8 Quantization for BERT need mxnet-mkl >= 1.6.0b20200115') else: diff --git a/scripts/bert/finetune_squad.py b/scripts/bert/finetune_squad.py index 40851fec88..86e21d9ec2 100644 --- a/scripts/bert/finetune_squad.py +++ b/scripts/bert/finetune_squad.py @@ -517,16 +517,17 @@ def calibration(net, num_calib_batches, quantized_dtype, calib_mode): 'Currently only supports CPU with MKL-DNN backend.' 
log.info('Now we are doing calibration on dev with %s.', ctx) collector = BertLayerCollector(clip_min=-50, clip_max=10, logger=log) - net = quantize_net_v2(net, quantized_dtype=quantized_dtype, - exclude_layers=[], - quantize_mode='smart', - quantize_granularity='channel-wise', - calib_data=dev_dataloader, - calib_mode=calib_mode, - num_calib_examples=test_batch_size * num_calib_batches, - ctx=ctx, - LayerOutputCollector=collector, - logger=log) + num_calib_examples=test_batch_size * num_calib_batches, + net = mx.contrib.quantization.quantize_net_v2(net, quantized_dtype=quantized_dtype, + exclude_layers=[], + quantize_mode='smart', + quantize_granularity='channel-wise', + calib_data=dev_dataloader, + calib_mode=calib_mode, + num_calib_examples=num_calib_examples, + ctx=ctx, + LayerOutputCollector=collector, + logger=log) # save params ckpt_name = 'model_bert_squad_quantized_{0}'.format(calib_mode) params_saved = os.path.join(output_dir, ckpt_name) @@ -631,12 +632,11 @@ def evaluate(): if __name__ == '__main__': if only_calibration: try: - from mxnet.contrib.quantization import quantize_net_v2 calibration(net, num_calib_batches, quantized_dtype, calib_mode) - except: + except AttributeError: nlp.utils.version.check_version('1.7.0', warning_only=True, library=mx) warnings.warn('INT8 Quantization for BERT need mxnet-mkl >= 1.6.0b20200115') elif not only_predict: From 5b500bdb54d19fb9ff8bc1560c918fcedb9099f2 Mon Sep 17 00:00:00 2001 From: xinyu-intel Date: Thu, 23 Jan 2020 17:29:42 +0800 Subject: [PATCH 22/29] fix ut --- scripts/bert/finetune_classifier.py | 2 +- scripts/bert/finetune_squad.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/bert/finetune_classifier.py b/scripts/bert/finetune_classifier.py index 5c37ace5b7..a480d29205 100644 --- a/scripts/bert/finetune_classifier.py +++ b/scripts/bert/finetune_classifier.py @@ -435,7 +435,7 @@ def calibration(net, dev_data_list, num_calib_batches, quantized_dtype, calib_mo logging.info('Now we are doing calibration on dev with %s.', ctx) for _, dev_data in dev_data_list: collector = BertLayerCollector(clip_min=-50, clip_max=10, logger=logging) - num_calib_examples=dev_batch_size * num_calib_batches, + num_calib_examples = dev_batch_size * num_calib_batches net = mx.contrib.quantization.quantize_net_v2(net, quantized_dtype=quantized_dtype, exclude_layers=[], quantize_mode='smart', diff --git a/scripts/bert/finetune_squad.py b/scripts/bert/finetune_squad.py index 86e21d9ec2..dc80449244 100644 --- a/scripts/bert/finetune_squad.py +++ b/scripts/bert/finetune_squad.py @@ -517,7 +517,7 @@ def calibration(net, num_calib_batches, quantized_dtype, calib_mode): 'Currently only supports CPU with MKL-DNN backend.' 
log.info('Now we are doing calibration on dev with %s.', ctx)
     collector = BertLayerCollector(clip_min=-50, clip_max=10, logger=log)
-    num_calib_examples=test_batch_size * num_calib_batches,
+    num_calib_examples = test_batch_size * num_calib_batches
     net = mx.contrib.quantization.quantize_net_v2(net, quantized_dtype=quantized_dtype,
                                                   exclude_layers=[],
                                                   quantize_mode='smart',

From 85fc23e3dfc937e0b959e5906796da8bfcce83c3 Mon Sep 17 00:00:00 2001
From: xinyu-intel
Date: Thu, 23 Jan 2020 17:39:15 +0800
Subject: [PATCH 23/29] trigger

From a2fa345c224c155b8efc39ca8aca173240c9521a Mon Sep 17 00:00:00 2001
From: xinyu-intel
Date: Thu, 23 Jan 2020 18:32:04 +0800
Subject: [PATCH 24/29] trigger

From d400b8574511c1ef0351416de74f64e648adde46 Mon Sep 17 00:00:00 2001
From: xinyu-intel
Date: Mon, 27 Jan 2020 21:18:35 +0800
Subject: [PATCH 25/29] add quantization tutorial for mrpc

---
 docs/examples/sentence_embedding/bert.md | 42 ++++++++++++++++++++++
 1 file changed, 42 insertions(+)

diff --git a/docs/examples/sentence_embedding/bert.md b/docs/examples/sentence_embedding/bert.md
index 675f9300f0..9107a1d6f1 100644
--- a/docs/examples/sentence_embedding/bert.md
+++ b/docs/examples/sentence_embedding/bert.md
@@ -41,6 +41,7 @@ import random
 import numpy as np
 import mxnet as mx
 import gluonnlp as nlp
+from gluonnlp.calibration import BertLayerCollector
 # this notebook assumes that all required scripts are already
 # downloaded from the corresponding tutorial webpage on http://gluon-nlp.mxnet.io
 from bert import data
@@ -294,6 +295,47 @@ for epoch_id in range(num_epochs):
         step_loss = 0
 ```
 
+## Quantize the model
+
+GluonNLP also delivered some int8 quantization methods to improve the performance and reduce the deployment costs for the natural language inference tasks. In real production, there are two main benefits of lower precision (INT8). First, the computation can be accelerated by the low precision instruction, like Intel Vector Neural Network Instruction (VNNI). Second, lower precision data type would save the memory bandwidth and allow for better cache locality and save the power. The new feature can get up to 4X performance speedup in the latest [AWS EC2 C5 instances](https://aws.amazon.com/blogs/aws/now-available-new-c5-instance-sizes-and-bare-metal-instances/) under the [Intel Deep Learning Boost (VNNI)](https://www.intel.ai/intel-deep-learning-boost/) enabled hardware with less than 0.5% accuracy drop.
+
+Now we have a fine-tuned model on MRPC training dataset and in this section, we will quantize the model into int8 data type on a subset of MRPC validation dataset.
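+
+Calibration runs a few forward passes over batches from the validation set and
+records the output range of every layer. The `BertLayerCollector` used below
+additionally clips the GELU outputs at a maximum of 10 and the layer-normalization
+outputs at a minimum of -50 while collecting; these are the same clipping
+thresholds the example scripts in this PR use. The collected ranges are then
+used directly as quantization thresholds, as shown below.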
+
+```{.python .input}
+# The hyperparameters
+dev_batch_size = 32
+num_calib_batches = 5
+quantized_dtype = 'auto'
+calib_mode = 'customize'
+
+# Calibration function
+def calibration(net, dev_data, num_calib_batches, quantized_dtype, calib_mode):
+    """calibration function on the dev dataset."""
+    print('Now we are doing calibration on dev with %s.' % mx.cpu())
+    collector = BertLayerCollector(clip_min=-50, clip_max=10, logger=None)
+    num_calib_examples = dev_batch_size * num_calib_batches
+    quantized_net = mx.contrib.quantization.quantize_net_v2(net, quantized_dtype=quantized_dtype,
+                                                            exclude_layers=[],
+                                                            quantize_mode='smart',
+                                                            quantize_granularity='channel-wise',
+                                                            calib_data=dev_data,
+                                                            calib_mode=calib_mode,
+                                                            num_calib_examples=num_calib_examples,
+                                                            ctx=mx.cpu(),
+                                                            LayerOutputCollector=collector,
+                                                            logger=None)
+    # return the calibrated, quantized network so the caller can use it
+    return quantized_net
+
+try:
+    bert_classifier = calibration(bert_classifier,
+                                  bert_dataloader,
+                                  num_calib_batches,
+                                  quantized_dtype,
+                                  calib_mode)
+except AttributeError:
+    nlp.utils.version.check_version('1.7.0', warning_only=True, library=mx)
+    warnings.warn('INT8 Quantization for BERT need mxnet-mkl >= 1.6.0b20200115')
+```
+

From 3b4eee4f0dfce61780b215dc588bf4bd2a95d2da Mon Sep 17 00:00:00 2001
From: xinyu-intel
Date: Thu, 30 Jan 2020 23:07:05 +0800
Subject: [PATCH 26/29] fix lint

---
 scripts/bert/finetune_squad.py | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/scripts/bert/finetune_squad.py b/scripts/bert/finetune_squad.py
index 10c6076570..2ff289053d 100644
--- a/scripts/bert/finetune_squad.py
+++ b/scripts/bert/finetune_squad.py
@@ -519,13 +519,6 @@ def calibration(net, num_calib_batches, quantized_dtype, calib_mode):
         nlp.data.batchify.Stack('float32'),
         nlp.data.batchify.Stack('float32'))
 
-    dev_dataset = preprocess_dataset(tokenizer,
-                                     dev_data,
-                                     max_seq_length=max_seq_length,
-                                     doc_stride=doc_stride,
-                                     max_query_length=max_query_length,
-                                     input_features=False)
-
     dev_data_transform = preprocess_dataset(tokenizer,
                                             dev_data,
                                             max_seq_length=max_seq_length,

From 54a6e3a94186252d20e603cd138288382d2ee9b7 Mon Sep 17 00:00:00 2001
From: xinyu-intel
Date: Sat, 1 Feb 2020 22:47:29 +0800
Subject: [PATCH 27/29] add accuracy to modelzoo

---
 scripts/bert/index.rst | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/scripts/bert/index.rst b/scripts/bert/index.rst
index 0433314ff0..fba74d927c 100644
--- a/scripts/bert/index.rst
+++ b/scripts/bert/index.rst
@@ -188,6 +188,27 @@ To get the score of the dev data, you need to download the dev dataset (`dev-v2.
 
     $ python evaluate-v2.0.py dev-v2.0.json predictions.json
 
+BERT INT8 Quantization
+~~~~~~~~~~~~~~~~~~~~~~
+
+GluonNLP provides the following example scripts to quantize fine-tuned
+BERT models into int8 data type.
+
++--------------+-------------------+-------------------+
+| Dataset      | SQuAD 1.1         | MRPC              |
++==============+===================+===================+
+| Model        | bert_12_768_12    | bert_12_768_12    |
++--------------+-------------------+-------------------+
+| FP32 F1 / EM | 88.58 / 81.18     | 87.01 / 90.97     |
++--------------+-------------------+-------------------+
+| INT8 F1 / EM | 88.10 / 80.32     | 87.01 / 90.88     |
++--------------+-------------------+-------------------+
+| Log          |                   |                   |
++--------------+-------------------+-------------------+
+| Command      |                   |                   |
++--------------+-------------------+-------------------+
+
+For all model settings above, we use a subset of evaluation dataset for calibration.
Pre-training from Scratch ~~~~~~~~~~~~~~~~~~~~~~~~~ From 99ab77be67dc0c167b615bd04a3c64d6ab13ccb5 Mon Sep 17 00:00:00 2001 From: xinyu-intel Date: Sun, 2 Feb 2020 08:31:28 +0800 Subject: [PATCH 28/29] add SST int8 --- scripts/bert/index.rst | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/scripts/bert/index.rst b/scripts/bert/index.rst index a34ee36109..5762caf979 100644 --- a/scripts/bert/index.rst +++ b/scripts/bert/index.rst @@ -199,21 +199,18 @@ BERT INT8 Quantization ~~~~~~~~~~~~~~~~~~~~~~ GluonNLP provides the following example scripts to quantize fine-tuned -BERT models into int8 data type. - -+--------------+-------------------+-------------------+ -| Dataset | SQuAD 1.1 | MRPC | -+==============+===================+===================+ -| Model | bert_12_768_12 | bert_12_768_12 | -+--------------+-------------------+-------------------+ -| FP32 F1 / EM | 88.58 / 81.18 | 87.01 / 90.97 | -+--------------+-------------------+-------------------+ -| INT8 F1 / EM | 88.10 / 80.32 | 87.01 / 90.88 | -+--------------+-------------------+-------------------+ -| Log | | | -+--------------+-------------------+-------------------+ -| Command | | | -+--------------+-------------------+-------------------+ +BERT models into int8 data type. Note that INT8 Quantization needs a nightly +version of mxnet-mkl. + ++-----------+-------------------+---------+---------+---------+---------+-----+---------+ +| Dataset | Model | FP32 EM | INT8 EM | FP32 F1 | INT8 F1 | Log | Command | ++===========+===================+=========+=========+=========+=========+=====+=========+ +| SQuAD 1.1 | bert_12_768_12 | 81.18 | 80.32 | 88.58 | 88.10 | | | ++-----------+-------------------+---------+---------+---------+---------+-----+---------+ +| MRPC | bert_12_768_12 | 87.01 | 87.01 | 90.97 | 90.88 | | | ++-----------+-------------------+---------+---------+---------+---------+-----+---------+ +| SST | bert_12_768_12 | 93.23 | 93.00 | | | | | ++-----------+-------------------+---------+---------+---------+---------+-----+---------+ For all model settings above, we use a subset of evaluation dataset for calibration. From 05918ec07567ad177a4cd45aa7e4752f213af3ca Mon Sep 17 00:00:00 2001 From: xinyu-intel Date: Sun, 2 Feb 2020 22:28:19 +0800 Subject: [PATCH 29/29] address comments --- docs/examples/sentence_embedding/bert.md | 4 ++-- scripts/bert/index.rst | 20 +++++++++++++++----- src/gluonnlp/calibration/collector.py | 15 +-------------- 3 files changed, 18 insertions(+), 21 deletions(-) diff --git a/docs/examples/sentence_embedding/bert.md b/docs/examples/sentence_embedding/bert.md index 9a1c9cce0c..e0f3506405 100644 --- a/docs/examples/sentence_embedding/bert.md +++ b/docs/examples/sentence_embedding/bert.md @@ -297,9 +297,9 @@ for epoch_id in range(num_epochs): ## Quantize the model -GluonNLP also delivered some int8 quantization methods to improve the performance and reduce the deployment costs for the natural language inference tasks. In real production, there are two main benefits of lower precision (INT8). First, the computation can be accelerated by the low precision instruction, like Intel Vector Neural Network Instruction (VNNI). Second, lower precision data type would save the memory bandwidth and allow for better cache locality and save the power. 
The new feature can get up to 4X performance speedup in the latest [AWS EC2 C5 instances](https://aws.amazon.com/blogs/aws/now-available-new-c5-instance-sizes-and-bare-metal-instances/) under the [Intel Deep Learning Boost (VNNI)](https://www.intel.ai/intel-deep-learning-boost/) enabled hardware with less than 0.5% accuracy drop.
+GluonNLP also provides INT8 quantization methods to improve performance and reduce deployment costs for natural language inference tasks. In real production, there are two main benefits of lower precision (INT8). First, the computation can be accelerated by low-precision instructions such as the Intel Vector Neural Network Instructions (VNNI). Second, the lower-precision data type saves memory bandwidth, allows for better cache locality, and saves power. This feature delivers up to 4X performance speedup on the latest [AWS EC2 C5 instances](https://aws.amazon.com/blogs/aws/now-available-new-c5-instance-sizes-and-bare-metal-instances/) with [Intel Deep Learning Boost (VNNI)](https://www.intel.ai/intel-deep-learning-boost/) enabled hardware, with less than a 0.5% accuracy drop.
 
-Now we have a fine-tuned model on MRPC training dataset and in this section, we will quantize the model into int8 data type on a subset of MRPC validation dataset.
+Now that we have a model fine-tuned on the MRPC training set, this section quantizes it into the INT8 data type on a subset of the MRPC validation set.
 
 ```{.python .input}
 # The hyperparameters
diff --git a/scripts/bert/index.rst b/scripts/bert/index.rst
index 5762caf979..2ad4a224a6 100644
--- a/scripts/bert/index.rst
+++ b/scripts/bert/index.rst
@@ -200,17 +200,27 @@ BERT INT8 Quantization
 ~~~~~~~~~~~~~~~~~~~~~~
 
 GluonNLP provides the following example scripts to quantize fine-tuned
-BERT models into int8 data type. Note that INT8 Quantization needs a nightly
-version of mxnet-mkl.
+BERT models into the int8 data type. Note that INT8 quantization requires a nightly
+version of `mxnet-mkl `_.
+
+Sentence Classification
++++++++++++++++++++++++
+
++-----------+-------------------+---------------+---------------+---------+---------+-----+---------+
+| Dataset   | Model             | FP32 Accuracy | INT8 Accuracy | FP32 F1 | INT8 F1 | Log | Command |
++===========+===================+===============+===============+=========+=========+=====+=========+
+| MRPC      | bert_12_768_12    | 87.01         | 87.01         | 90.97   | 90.88   |     |         |
++-----------+-------------------+---------------+---------------+---------+---------+-----+---------+
+| SST-2     | bert_12_768_12    | 93.23         | 93.00         |         |         |     |         |
++-----------+-------------------+---------------+---------------+---------+---------+-----+---------+
+
+Question Answering
+++++++++++++++++++
 
 +-----------+-------------------+---------+---------+---------+---------+-----+---------+
 | Dataset   | Model             | FP32 EM | INT8 EM | FP32 F1 | INT8 F1 | Log | Command |
 +===========+===================+=========+=========+=========+=========+=====+=========+
 | SQuAD 1.1 | bert_12_768_12    | 81.18   | 80.32   | 88.58   | 88.10   |     |         |
 +-----------+-------------------+---------+---------+---------+---------+-----+---------+
-| MRPC      | bert_12_768_12    | 87.01   | 87.01   | 90.97   | 90.88   |     |         |
-+-----------+-------------------+---------+---------+---------+---------+-----+---------+
-| SST       | bert_12_768_12    | 93.23   | 93.00   |         |         |     |         |
-+-----------+-------------------+---------+---------+---------+---------+-----+---------+
 
 For all model settings above, we use a subset of evaluation dataset for calibration.
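+
+As a reference, calibration for MRPC can be launched with a command along the
+following lines (the parameter path is a placeholder for wherever the
+fine-tuned weights were saved)::
+
+  $ python finetune_classifier.py --task_name MRPC --epochs 1 --only_calibration \
+        --model_parameters ./output_dir/model_bert_MRPC_2.params --pad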
diff --git a/src/gluonnlp/calibration/collector.py b/src/gluonnlp/calibration/collector.py
index fcbd194482..955031215d 100644
--- a/src/gluonnlp/calibration/collector.py
+++ b/src/gluonnlp/calibration/collector.py
@@ -15,21 +15,8 @@
 # specific language governing permissions and limitations
 # under the License.
 # pylint:disable=redefined-outer-name,logging-format-interpolation
-"""
-Bert layer output collector with threshold clipping for calibration
-===================================================================
+"""BERT layer output collector with threshold clipping for calibration."""
 
-This collector is designed for collect and clip the layer outputs of bert
-while calibration with Gluon NLP Toolkit.
-
-@article{devlin2018bert,
-  title={BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding},
-  author={Devlin, Jacob and Chang, Ming- \
-          Wei and Lee, Kenton and Toutanova, Kristina},
-  journal={arXiv preprint arXiv:1810.04805},
-  year={2018}
-}
-"""
 import ctypes
 from mxnet import ndarray
 from mxnet.base import NDArrayHandle, py_str