Add quantized Fully Convolutional Network model #877

Merged: 10 commits, Aug 5, 2019
14 changes: 10 additions & 4 deletions docs/tutorials/deployment/int8_inference.py
@@ -11,13 +11,14 @@

GluonCV provides quantized models to improve performance and reduce deployment costs for computer vision inference tasks. In real production, lower precision (INT8) brings two main benefits. First, the computation can be accelerated by low-precision instructions such as the Intel Vector Neural Network Instructions (VNNI). Second, the lower-precision data type saves memory bandwidth, allows for better cache locality, and saves power. This feature delivers up to 4X performance speedup on the latest `AWS EC2 C5 instances <https://aws.amazon.com/blogs/aws/now-available-new-c5-instance-sizes-and-bare-metal-instances/>`_ with `Intel Deep Learning Boost (VNNI) <https://www.intel.ai/intel-deep-learning-boost/>`_ enabled hardware, with less than 0.5% accuracy drop.

Please checkout `verify_pretrained.py <https://raw.githubusercontent.com/dmlc/gluon-cv/master/scripts/classification/imagenet/verify_pretrained.py>`_ for imagenet inference
and `eval_ssd.py <https://raw.githubusercontent.com/dmlc/gluon-cv/master/scripts/detection/ssd/eval_ssd.py>`_ for SSD inference.
Please check out `verify_pretrained.py <https://raw.githubusercontent.com/dmlc/gluon-cv/master/scripts/classification/imagenet/verify_pretrained.py>`_ for ImageNet inference,
`eval_ssd.py <https://raw.githubusercontent.com/dmlc/gluon-cv/master/scripts/detection/ssd/eval_ssd.py>`_ for SSD inference, and `test.py <https://raw.githubusercontent.com/dmlc/gluon-cv/master/scripts/segmentation/test.py>`_
for FCN inference.

Performance
-----------

GluonCV supports some quantized classification models and detection models.
GluonCV supports some quantized classification models, detection models and segmentation models.
For throughput, the goal is to achieve maximum machine efficiency by combining inference requests together and obtaining the results in one iteration. From the bar chart, it is clear that the fusion and quantization approach improves throughput by 3.22X to 7.24X for the selected models.
The CPU performance below is collected with dummy input on an AWS EC2 C5.24xlarge instance with 24 physical cores.

@@ -41,8 +42,13 @@
+-----------------------+----------+------------+------------------+------------------+---------+-----------------+-----------------+
| SSD-mobilenet1.0 512* | VOC | 224 | 65.97 | 212.59 | 3.22 | 75.42 | 74.70 |
+-----------------------+----------+------------+------------------+------------------+---------+-----------------+-----------------+
| FCN_resnet101 | VOC | 1 | 5.09 | 21.12 | 4.15 | 97.97% | 95.71% |
+-----------------------+----------+------------+------------------+------------------+---------+-----------------+-----------------+
| FCN_resnet101 | COCO | 1 | 5.10 | 25.98 | 5.09 | 91.28% | 90.10% |
+-----------------------+----------+------------+------------------+------------------+---------+-----------------+-----------------+


Quantized SSD models are evaluated with ``nms_thresh=0.45``, ``nms_topk=200``.
Quantized SSD models are evaluated with ``nms_thresh=0.45``, ``nms_topk=200``. For FCN models, the accuracy metric is pixAcc.
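
For context (not part of this diff), loading one of the new quantized FCN models should look much like the classification and SSD cases. Below is a minimal sketch, assuming the pretrained INT8 parameters are available from the model zoo; the output handling mirrors how ``scripts/segmentation/test.py`` consumes the model.

    import mxnet as mx
    from gluoncv.model_zoo import get_model

    # Hedged sketch: load the newly registered INT8 FCN model and run it on a
    # dummy 480x480 input on CPU.
    net = get_model('fcn_resnet101_voc_int8', pretrained=True)
    net.hybridize(static_alloc=True, static_shape=True)
    x = mx.nd.random.uniform(shape=(1, 3, 480, 480))
    output = net(x)[0]                                       # (batch, classes, H, W) scores
    pred = mx.nd.squeeze(mx.nd.argmax(output, 1)).asnumpy()  # per-pixel class ids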

Demo usage for SSD
------------------
2 changes: 2 additions & 0 deletions gluoncv/model_zoo/model_zoo.py
@@ -219,6 +219,8 @@
    'alpha_pose_resnet101_v1b_coco': alpha_pose_resnet101_v1b_coco,
    'vgg16_ucf101': vgg16_ucf101,
    'inceptionv3_ucf101': inceptionv3_ucf101,
    'fcn_resnet101_voc_int8': fcn_resnet101_voc_int8,
    'fcn_resnet101_coco_int8': fcn_resnet101_coco_int8,
}


32 changes: 27 additions & 5 deletions gluoncv/model_zoo/quantized/quantized.py

Large diffs are not rendered by default.

53 changes: 53 additions & 0 deletions gluoncv/utils/compress_json.py
@@ -0,0 +1,53 @@
# pylint: disable=line-too-long
"""Encode/Decode helper function for compressed quantized models"""
import zlib
import base64

def encode_json(json_file, is_print=False):
    r""" Encode json string to compressed base64 string.

    Parameters
    ----------
    json_file : str
        String value represents the path to json file.
    is_print : bool
        Boolean value controls whether to print the encoded base64 string.
    """
    with open(json_file, encoding='utf-8') as fh:
        data = fh.read()
    zipped_str = zlib.compress(data.encode('utf-8'))
    b64_str = base64.b64encode(zipped_str)
    if is_print:
        print(b64_str)
    return b64_str

def decode_b64(b64_str, is_print=False):
    r""" Decode b64 string to json format

    Parameters
    ----------
    b64_str : str
        String value represents the compressed base64 string.
    is_print : bool
        Boolean value controls whether to print the decoded json string.
    """
    json_str = zlib.decompress(base64.b64decode(b64_str)).decode('utf-8')
    if is_print:
        print(json_str)
    return json_str

def get_compressed_model(model_name, compressed_json):
    r""" Get compressed (INT8) models from existing `compressed_json` dict

    Parameters
    ----------
    model_name : str
        String value represents the name of compressed (INT8) model.
    compressed_json : dict
        Dictionary's key represents the name of (INT8) model, and dictionary's value
        represents the compressed json string of (INT8) model.
    """
    b64_str = compressed_json.get(model_name, None)
    if b64_str:
        return decode_b64(b64_str)
    raise ValueError('Model: {} is not found. Available compressed models are:\n{}'.format(model_name, '\n'.join(list(compressed_json.keys()))))
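
Not part of the diff: a minimal usage sketch of these helpers. The round trip assumes a local symbol-JSON file (``model-symbol.json`` and ``model-0000.params`` are hypothetical paths), and the ``SymbolBlock`` step only illustrates how a decoded symbol string could be turned back into a runnable network; it is not necessarily how ``quantized.py`` consumes it.

    import mxnet as mx
    from mxnet.gluon import SymbolBlock
    from gluoncv.utils.compress_json import encode_json, decode_b64

    # Compress a symbol JSON into a base64 string and recover it again.
    b64_str = encode_json('model-symbol.json')   # hypothetical local file
    json_str = decode_b64(b64_str)               # exact round trip of the file contents

    # Illustration only: rebuild a network from the decoded symbol string.
    sym = mx.sym.load_json(json_str)
    net = SymbolBlock(sym, inputs=mx.sym.var('data'))
    net.load_parameters('model-0000.params')     # hypothetical params file
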
218 changes: 190 additions & 28 deletions scripts/segmentation/test.py
@@ -1,9 +1,12 @@
import os
from tqdm import tqdm
import numpy as np
import argparse
import time
import sys

import mxnet as mx
from mxnet import gluon
from mxnet import gluon, ndarray as nd
from mxnet.gluon.data.vision import transforms

import gluoncv
@@ -12,19 +15,66 @@
from gluoncv.data import get_segmentation_dataset, ms_batchify_fn
from gluoncv.utils.viz import get_color_pallete

from train import parse_args
def parse_args():
    parser = argparse.ArgumentParser(description='Validation on Segmentation model')
    # model and dataset
    parser.add_argument('--model', type=str, default='fcn',
                        help='model name (default: fcn)')
    parser.add_argument('--backbone', type=str, default='resnet101',
                        help='base network')
    parser.add_argument('--image-shape', type=int, default=480,
                        help='image shape')
    parser.add_argument('--base-size', type=int, default=520,
                        help='base image size')
    parser.add_argument('--crop-size', type=int, default=480,
                        help='crop image size')
    parser.add_argument('--mode', type=str, default='val',
                        help='val, testval')
    parser.add_argument('--dataset', type=str, default='pascal_voc',
                        help='dataset used for validation [pascal_voc, pascal_aug, coco, ade20k]')
    parser.add_argument('--quantized', action='store_true',
                        help='whether to use quantized model')
    parser.add_argument('--batch-size', type=int, default=16)
    parser.add_argument('--num-iterations', type=int, default=100,
                        help='number of benchmarking iterations.')
    parser.add_argument('--workers', type=int, default=4,
                        help='number of workers for data loading')
    parser.add_argument('--pretrained', action="store_true",
                        help='whether to use pretrained params')
    parser.add_argument('--ngpus', type=int,
                        default=len(mx.test_utils.list_gpus()),
                        help='number of GPUs (default: 4)')
    parser.add_argument('--aux', action='store_true', default=False,
                        help='Auxiliary loss')
    # synchronized Batch Normalization
    parser.add_argument('--syncbn', action='store_true', default=False,
                        help='using Synchronized Cross-GPU BatchNorm')
    parser.add_argument('--resume', type=str, default=None,
                        help='put the path to resuming file if needed')
    # evaluation only
    parser.add_argument('--eval', action='store_true', default=False,
                        help='evaluation only')
    # dummy benchmark
    parser.add_argument('--benchmark', action='store_true', default=False,
                        help='whether to use dummy data for benchmark')

def test(args):
    args = parser.parse_args()

    args.ctx = [mx.cpu(0)]
    args.ctx = [mx.gpu(i) for i in range(args.ngpus)] if args.ngpus > 0 else args.ctx

    args.norm_layer = mx.gluon.contrib.nn.SyncBatchNorm if args.syncbn \
        else mx.gluon.nn.BatchNorm
    args.norm_kwargs = {'num_devices': args.ngpus} if args.syncbn else {}
    return args


def test(model, args, input_transform):
    # output folder
    outdir = 'outdir'
    if not os.path.exists(outdir):
        os.makedirs(outdir)
    # image transform
    input_transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize([.485, .456, .406], [.229, .224, .225]),
    ])
    # dataset and dataloader
    # get dataset
    if args.eval:
        testset = get_segmentation_dataset(
            args.dataset, split='val', mode='testval', transform=input_transform)
@@ -34,23 +84,8 @@ def test(args):
        testset = get_segmentation_dataset(
            args.dataset, split='test', mode='test', transform=input_transform)
    test_data = gluon.data.DataLoader(
        testset, args.test_batch_size, shuffle=False, last_batch='keep',
        testset, args.batch_size, shuffle=False, last_batch='keep',
        batchify_fn=ms_batchify_fn, num_workers=args.workers)
    # create network
    if args.model_zoo is not None:
        model = get_model(args.model_zoo, pretrained=True)
    else:
        model = get_segmentation_model(model=args.model, dataset=args.dataset, ctx=args.ctx,
                                       backbone=args.backbone, norm_layer=args.norm_layer,
                                       norm_kwargs=args.norm_kwargs, aux=args.aux,
                                       base_size=args.base_size, crop_size=args.crop_size)
    # load pretrained weight
    assert args.resume is not None, '=> Please provide the checkpoint using --resume'
    if os.path.isfile(args.resume):
        model.load_parameters(args.resume, ctx=args.ctx)
    else:
        raise RuntimeError("=> no checkpoint found at '{}'" \
            .format(args.resume))
    print(model)
    evaluator = MultiEvalModel(model, testset.num_class, ctx_list=args.ctx)
    metric = gluoncv.utils.metrics.SegmentationMetric(testset.num_class)
@@ -63,7 +98,7 @@ def test(args):
                       for target in dsts]
            metric.update(targets, predicts)
            pixAcc, mIoU = metric.get()
            tbar.set_description( 'pixAcc: %.4f, mIoU: %.4f' % (pixAcc, mIoU))
            tbar.set_description('pixAcc: %.4f, mIoU: %.4f' % (pixAcc, mIoU))
        else:
            im_paths = dsts
            predicts = evaluator.parallel_forward(data)
@@ -74,8 +109,135 @@ def test(args):
                outname = os.path.splitext(impath)[0] + '.png'
                mask.save(os.path.join(outdir, outname))


def test_quantization(model, args, input_transform):
    # output folder
    outdir = 'outdir_int8'
    if not os.path.exists(outdir):
        os.makedirs(outdir)
    # hybridize
    model.hybridize(static_alloc=True, static_shape=True)

    # get dataset
    if args.eval:
        testset = get_segmentation_dataset(
            args.dataset, split='val', mode=args.mode, transform=input_transform)
    else:
        testset = get_segmentation_dataset(
            args.dataset, split='test', mode=args.mode, transform=input_transform)
    size = len(testset)
    batchify_fn = ms_batchify_fn if testset.mode == 'test' else None
    test_data = gluon.data.DataLoader(
        testset, args.batch_size, batchify_fn=batchify_fn, last_batch='keep',
        shuffle=False, num_workers=args.workers)
    print(model)
    metric = gluoncv.utils.metrics.SegmentationMetric(testset.num_class)

    tbar = tqdm(test_data)
    metric.reset()
    tic = time.time()
    for i, (batch, dsts) in enumerate(tbar):
        if args.eval:
            targets = mx.gluon.utils.split_and_load(dsts, ctx_list=args.ctx, even_split=False)
            data = mx.gluon.utils.split_and_load(batch, ctx_list=args.ctx, batch_axis=0, even_split=False)
            outputs = None
            for x in data:
                output = model(x)
                outputs = output if outputs is None else nd.concat(outputs, output, axis=0)
            metric.update(targets, outputs)
            pixAcc, mIoU = metric.get()
            tbar.set_description('pixAcc: %.4f, mIoU: %.4f' % (pixAcc, mIoU))
        else:
            for data, impath in zip(batch, dsts):
                data = data.as_in_context(args.ctx[0])
                if len(data.shape) < 4:
                    data = nd.expand_dims(data, axis=0)
                predict = model(data)[0]
                predict = mx.nd.squeeze(mx.nd.argmax(predict, 1)).asnumpy() + \
                    testset.pred_offset
                mask = get_color_pallete(predict, args.dataset)
                outname = os.path.splitext(impath)[0] + '.png'
                mask.save(os.path.join(outdir, outname))
    speed = size / (time.time() - tic)
    print('Inference speed with batchsize %d is %.2f img/sec' % (args.batch_size, speed))


def benchmarking(model, args):
    if args.quantized:
        model.hybridize(static_alloc=True, static_shape=True)
    else:
        model.hybridize()

    bs = args.batch_size
    num_iterations = args.num_iterations
    input_shape = (bs, 3, args.image_shape, args.image_shape)
    size = num_iterations * bs
    data = mx.random.uniform(-1.0, 1.0, shape=input_shape, ctx=args.ctx[0], dtype='float32')
    dry_run = 5
    with tqdm(total=size+dry_run*bs) as pbar:
        for n in range(dry_run + num_iterations):
            if n == dry_run:
                tic = time.time()
            outputs = model(data)
            for output in outputs:
                output.wait_to_read()
            pbar.update(bs)
    speed = size / (time.time() - tic)
    print('Throughput is %f imgs/sec' % speed)


if __name__ == "__main__":
args = parse_args()
args.test_batch_size = args.ngpus

withQuantization = False
model_prefix = args.model + '_' + args.backbone
if 'pascal' in args.dataset:
model_prefix += '_voc'
withQuantization = True if (args.backbone in ['resnet101'] and args.ngpus == 0) else withQuantization
elif args.dataset == 'coco':
model_prefix += '_coco'
withQuantization = True if (args.backbone in ['resnet101'] and args.ngpus == 0) else withQuantization
elif args.dataset == 'ade20k':
model_prefix += 'ade'
elif args.dataset == 'citys':
model_prefix += 'citys'
else:
raise ValueError('Unsupported dataset {} used'.format(args.dataset))

if withQuantization and args.quantized:
model_prefix += '_int8'

# create network
if args.pretrained:
model = get_model(model_prefix, pretrained=True)
model.collect_params().reset_ctx(ctx=args.ctx)
else:
assert "_in8" not in model_prefix, "Currently, Int8 models are not supported when pretrained=False"
model = get_segmentation_model(model=args.model, dataset=args.dataset, ctx=args.ctx,
backbone=args.backbone, norm_layer=args.norm_layer,
norm_kwargs=args.norm_kwargs, aux=args.aux,
base_size=args.base_size, crop_size=args.crop_size)
# load local pretrained weight
assert args.resume is not None, '=> Please provide the checkpoint using --resume'
if os.path.isfile(args.resume):
model.load_parameters(args.resume, ctx=args.ctx)
else:
raise RuntimeError("=> no checkpoint found at '{}'" \
.format(args.resume))

print("Successfully loaded %s model" % model_prefix)
print('Testing model: ', args.resume)
test(args)
# image transform
input_transform = transforms.Compose([
transforms.ToTensor(),
transforms.Normalize([.485, .456, .406], [.229, .224, .225]),
])

if not args.benchmark:
if '_int8' in model_prefix:
test_quantization(model, args, input_transform)
else:
test(model, args, input_transform)
else:
print('-----benchmarking on %s -----' % model_prefix)
benchmarking(model, args)
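
A usage note, not part of this diff: given the arguments defined in parse_args and the model_prefix logic above, a CPU-only evaluation of the new quantized VOC model could presumably be launched with something like

    python scripts/segmentation/test.py --model fcn --backbone resnet101 --dataset pascal_voc --quantized --pretrained --eval --ngpus 0

which resolves model_prefix to fcn_resnet101_voc_int8 and routes execution into test_quantization; passing --benchmark instead would exercise the dummy-input benchmarking path.
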
8 changes: 8 additions & 0 deletions tests/unittests/test_model_zoo.py
@@ -411,6 +411,14 @@ def test_quantized_ssd_models():
    _test_model_list(model_list, ctx, x)


@with_cpu(0)
def test_quantized_fcn_models():
    model_list = ['fcn_resnet101_voc_int8', 'fcn_resnet101_coco_int8']
    ctx = mx.context.current_context()
    x = mx.random.uniform(shape=(1, 3, 480, 480), ctx=ctx)
    _test_model_list(model_list, ctx, x)


if __name__ == '__main__':
    import nose
