diff --git a/README.md b/README.md
index 83ccff0..a117992 100644
--- a/README.md
+++ b/README.md
@@ -257,7 +257,7 @@ An original FP32 source model is quantized either using post-training quantizati
 Pose Estimation
 Based on Ref.
 Based on Ref.
-Quantized Model
+Quantized Model (COCO)
 mAP
 FP32: 0.364
 INT8: 0.359
 mAR
 FP32: 0.436
 INT8: 0.432
 PoseEstimation.md
diff --git a/zoo_torch/Docs/PoseEstimation.md b/zoo_torch/Docs/PoseEstimation.md
index 78b5ef4..20c1b33 100644
--- a/zoo_torch/Docs/PoseEstimation.md
+++ b/zoo_torch/Docs/PoseEstimation.md
@@ -21,7 +21,7 @@ RUN pip install scipy==1.1.0
 ## Obtaining model weights and dataset
 - The pose estimation model can be downloaded here:
-  -
+  - Pose Estimation pytorch model
 - coco dataset can be downloaded here:
@@ -39,3 +39,10 @@ RUN pip install scipy==1.1.0
 - We only support evaluation on COCO 2014 val images with person keypoints.
 - The reported results were obtained by evaluating on the whole dataset, which contains over 40k images and takes 15+ hours on a single RTX 2080Ti GPU. If you want a faster evaluation, pass a small number as the num_imgs argument to the second call to evaluate_session so that evaluation runs on only part of the dataset.
+
+## Quantizer Op Assumptions
+The evaluation script uses the default config file, which configures the quantizer ops with the following assumptions:
+- Weight quantization: 8 bits, asymmetric quantization
+- Bias parameters are not quantized
+- Activation quantization: 8 bits, asymmetric quantization
+- Model inputs are not quantized
diff --git a/zoo_torch/examples/pose_estimation_quanteval.py b/zoo_torch/examples/pose_estimation_quanteval.py
index fc4af80..d10a2da 100644
--- a/zoo_torch/examples/pose_estimation_quanteval.py
+++ b/zoo_torch/examples/pose_estimation_quanteval.py
@@ -26,6 +26,7 @@ import cv2
 from scipy.ndimage.filters import gaussian_filter
 import torch
+import torch.nn as nn
 import numpy as np
 from pycocotools.coco import COCO
 from pycocotools.cocoeval import COCOeval
@@ -33,6 +34,202 @@ from aimet_torch import quantsim
+
+
+def get_pre_stage_net():
+    network_dict = {'block_pre_stage': [{'sequential_CPM':
+                                             [[512, 256, (3, 1), 1, (1, 0), False],
+                                              [256, 256, (1, 3), 1, (0, 1)]]},
+                                        {'conv4_4_CPM': [256, 128, 3, 1, 1]}]}
+    return network_dict
+
+
+def get_shared_network_dict():
+    network_dict = get_pre_stage_net()
+    stage_channel = [0, 128, 185, 185, 185, 185, 185]
+    shared_channel = [0, 112, 128]
+    sequential4_channel = [0, 32, 48]
+    for i in range(1, 3):
+        network_dict['block%d_shared' % i] = \
+            [{'sequential1_stage%d_L1' % i:
+                  [[stage_channel[i], shared_channel[i], (7, 1), 1, (3, 0), False],
+                   [shared_channel[i], 128, (1, 7), 1, (0, 3)]]},
+             {'sequential2_stage%d_L1' % i:
+                  [[128, 112, (7, 1), 1, (3, 0), False],
+                   [112, 128, (1, 7), 1, (0, 3)]]}]
+
+        network_dict['block%d_1' % i] = [{'sequential3_stage%d_L1' % i:
+                                              [[128, 32, (3, 1), 1, (1, 0), False],
+                                               [32, 128, (1, 3), 1, (0, 1)]]},
+                                         {'sequential4_stage%d_L1' % i:
+                                              [[128, 32, (3, 1), 1, (1, 0), False],
+                                               [32, 128, (1, 3), 1, (0, 1)]]},
+                                         {'sequential5_stage%d_L1' % i:
+                                              [[128, 32, (3, 1), 1, (1, 0), False],
+                                               [32, 128, (1, 3), 1, (0, 1)]]},
+                                         {'Mconv6_stage%d_L1' % i: [128, 128, 1, 1, 0]},
+                                         {'Mconv7_stage%d_L1' % i: [128, 38, 1, 1, 0]}]
+        network_dict['block%d_2' % i] = [{'sequential3_stage%d_L1' % i:
+                                              [[128, 32, (3, 1), 1, (1, 0), False],
+                                               [32, 128, (1, 3), 1, (0, 1)]]},
+                                         {'sequential4_stage%d_L1' % i:
+                                              [[128, sequential4_channel[i], (3, 1), 1, (1, 0), False],
+                                               [sequential4_channel[i], 128, (1, 3), 1, (0, 1)]]},
+                                         {'sequential5_stage%d_L1' % i:
+                                              [[128, 48, (3, 1), 1, (1, 0), False],
+                                               [48, 128, (1, 3), 1, (0, 1)]]},
+                                         {'Mconv6_stage%d_L2' % i: [128, 128, 1, 1, 0]},
+                                         {'Mconv7_stage%d_L2' % i: [128, 19, 1, 1, 0]}]
+    return network_dict
+
+
+def get_model(upsample=False):
+    block0 = [{'conv0': [3, 32, 3, 1, 1]},
+              {'sequential1':
+                   [[32, 16, (3, 1), 1, (1, 0), False],
+                    [16, 32, (1, 3), 1, (0, 1)]]}, {'pool1_stage1': [2, 2, 0]},
+              {'sequential2':
+                   [[32, 32, (3, 1), 1, (1, 0), False],
+                    [32, 64, (1, 3), 1, (0, 1)]]},
+              {'sequential3':
+                   [[64, 32, (3, 1), 1, (1, 0), False],
+                    [32, 96, (1, 3), 1, (0, 1)]]}, {'pool2_stage1': [2, 2, 0]},
+              {'sequential4':
+                   [[96, 80, (3, 1), 1, (1, 0), False],
+                    [80, 256, (1, 3), 1, (0, 1)]]},
+              {'sequential5':
+                   [[256, 80, (3, 1), 1, (1, 0), False],
+                    [80, 256, (1, 3), 1, (0, 1)]]},
+              {'sequential6':
+                   [[256, 48, (3, 1), 1, (1, 0), False],
+                    [48, 128, (1, 3), 1, (0, 1)]]},
+              {'sequential7':
+                   [[128, 48, (3, 1), 1, (1, 0), False],
+                    [48, 256, (1, 3), 1, (0, 1)]]}, {'pool3_stage1': [2, 2, 0]},
+              {'sequential8':
+                   [[256, 96, (3, 1), 1, (1, 0), False],
+                    [96, 512, (1, 3), 1, (0, 1)]]},
+              {'sequential9':
+                   [[512, 192, (3, 1), 1, (1, 0), False],
+                    [192, 512, (1, 3), 1, (0, 1)]]}]
+
+
+    print("defining network with shared weights")
+    network_dict = get_shared_network_dict()
+
+    def define_base_layers(block, layer_size):
+        layers = []
+        for i in range(layer_size):
+            one_ = block[i]
+            for k, v in zip(one_.keys(), one_.values()):
+                if 'pool' in k:
+                    layers += [nn.MaxPool2d(kernel_size=v[0], stride=v[1], padding=v[2])]
+                elif 'sequential' in k:
+                    conv2d_1 = nn.Conv2d(in_channels=v[0][0], out_channels=v[0][1], kernel_size=v[0][2],
+                                         stride=v[0][3], padding=v[0][4], bias=v[0][5])
+                    conv2d_2 = nn.Conv2d(in_channels=v[1][0], out_channels=v[1][1], kernel_size=v[1][2],
+                                         stride=v[1][3], padding=v[1][4])
+                    sequential = nn.Sequential(conv2d_1, conv2d_2)
+                    layers += [sequential, nn.ReLU(inplace=True)]
+                else:
+                    conv2d = nn.Conv2d(in_channels=v[0], out_channels=v[1], kernel_size=v[2],
+                                       stride=v[3], padding=v[4])
+                    layers += [conv2d, nn.ReLU(inplace=True)]
+        return layers
+
+    def define_stage_layers(cfg_dict):
+        layers = define_base_layers(cfg_dict, len(cfg_dict) - 1)
+        one_ = cfg_dict[-1].keys()
+        k = list(one_)[0]
+        v = cfg_dict[-1][k]
+        conv2d = nn.Conv2d(in_channels=v[0], out_channels=v[1], kernel_size=v[2], stride=v[3],
+                           padding=v[4])
+        layers += [conv2d]
+        return nn.Sequential(*layers)
+
+    # create all the layers of the model
+    base_layers = define_base_layers(block0, len(block0))
+    pre_stage_layers = define_base_layers(network_dict['block_pre_stage'],
+                                          len(network_dict['block_pre_stage']))
+    models = {'block0': nn.Sequential(*base_layers),
+              'block_pre_stage': nn.Sequential(*pre_stage_layers)}
+
+    shared_layers_s1 = define_base_layers(network_dict['block1_shared'],
+                                          len(network_dict['block1_shared']))
+    shared_layers_s2 = define_base_layers(network_dict['block2_shared'],
+                                          len(network_dict['block2_shared']))
+    models['block1_shared'] = nn.Sequential(*shared_layers_s1)
+    models['block2_shared'] = nn.Sequential(*shared_layers_s2)
+
+    for k, v in zip(network_dict.keys(), network_dict.values()):
+        if 'shared' not in k and 'pre_stage' not in k:
+            models[k] = define_stage_layers(v)
+
+    model = PoseModel(models, upsample=upsample)
+    return model
+
+
+class PoseModel(nn.Module):
+    """
+    CMU pose estimation model.
+
+    Based on: "Realtime Multi-Person 2D Pose Estimation using Part Affinity Fields":
+    https://arxiv.org/pdf/1611.08050.pdf
+
+    Made lighter and more efficient by Amir (ahabibian@qti.qualcomm.com) in the
+    Morpheus team.
+
+    Some layers of the original model are commented out to reduce complexity.
+
+    """
+    def __init__(self, model_dict, upsample=False):
+        super(PoseModel, self).__init__()
+        self.upsample = upsample
+        self.basemodel = model_dict['block0']
+        self.pre_stage = model_dict['block_pre_stage']
+
+        self.stage1_shared = model_dict['block1_shared']
+        self.stage1_1 = model_dict['block1_1']
+        self.stage2_1 = model_dict['block2_1']
+
+        self.stage2_shared = model_dict['block2_shared']
+        self.stage1_2 = model_dict['block1_2']
+        self.stage2_2 = model_dict['block2_2']
+
+    def forward(self, x):
+        out1_vgg = self.basemodel(x)
+        out1 = self.pre_stage(out1_vgg)
+
+        out1_shared = self.stage1_shared(out1)
+        out1_1 = self.stage1_1(out1_shared)
+        out1_2 = self.stage1_2(out1_shared)
+
+        out2 = torch.cat([out1_1, out1_2, out1], 1)
+
+        out2_shared = self.stage2_shared(out2)
+        out2_1 = self.stage2_1(out2_shared)
+        out2_2 = self.stage2_2(out2_shared)
+
+        if self.upsample:
+            # parameters to check for up-sampling: align_corners = True, mode='nearest'
+            upsampler = nn.Upsample(scale_factor=2, mode='bilinear')
+            out2_1_up = upsampler(out2_1)
+            out2_2_up = upsampler(out2_2)
+            return out1_1, out1_2, out2_1, out2_2, out2_1_up, out2_2_up
+        else:
+            return out1_1, out1_2, out2_1, out2_2
+
+
+class ModelBuilder(object):
+    def __init__(self, upsample=False):
+        self.model = None
+        self.upsample = upsample
+
+    def create_model(self):
+        model = get_model(self.upsample)
+        self.model = model
+        return self.model
+
+
 def non_maximum_suppression(map, thresh):
     map_s = gaussian_filter(map, sigma=3)
@@ -450,7 +647,7 @@ def parse_args():
     parser.add_argument('model_dir',
                         help='The location where the .pth file is saved,'
-                             'the whole model should be saved by torch.save()',
+                             ' the .pth contains the model weights',
                         type=str)
     parser.add_argument('coco_path',
                         help='The location coco images and annotations are saved. '
@@ -476,7 +673,15 @@ def parse_args():
 def pose_estimation_quanteval(args):
     # load the model checkpoint from meta
-    model = torch.load(args.model_dir)
+    model_builder = ModelBuilder()
+    model_builder.create_model()
+    model = model_builder.model
+
+    state_dict = torch.load(args.model_dir)
+    state = model.state_dict()
+    state.update(state_dict)
+
+    model.load_state_dict(state)
 
     # create quantsim object which inserts quant ops between layers
     sim = quantsim.QuantizationSimModel(model,
@@ -484,7 +689,7 @@
                                         quant_scheme=args.quant_scheme)
 
     evaluate = partial(evaluate_model,
-                       num_imgs=100
+                       num_imgs=500
                        )
 
     sim.compute_encodings(evaluate, args.coco_path)
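For context, here is a minimal, illustrative sketch (not part of the patch) of the new weight-loading path introduced above: the network is rebuilt in code via ModelBuilder and a weights-only checkpoint is merged into its state_dict. The import path and the checkpoint path are placeholders assumed for illustration.

```python
# Illustrative sketch only -- not part of the patch.
import torch

# Assumes the example module is importable from the working directory.
from pose_estimation_quanteval import ModelBuilder

# Rebuild the architecture in code instead of unpickling a whole model object.
builder = ModelBuilder(upsample=False)
model = builder.create_model()

# The .pth checkpoint is now expected to hold only a state_dict of weights.
checkpoint = torch.load('path/to/pose_estimation.pth', map_location='cpu')  # placeholder path
state = model.state_dict()
state.update(checkpoint)      # overwrite matching keys with the checkpoint weights
model.load_state_dict(state)
model.eval()
```

The resulting model is then handed to quantsim.QuantizationSimModel and sim.compute_encodings exactly as in pose_estimation_quanteval above. Loading a bare state_dict rather than a pickled module keeps the checkpoint usable even if the class definition moves or changes, which is also why the patch drops the "whole model should be saved by torch.save()" wording from the argument help text.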