diff --git a/examples/pytorch/image_recognition/segment_anything/README.md b/examples/pytorch/image_recognition/segment_anything/README.md
new file mode 100644
index 00000000000..1a8f538d8e1
--- /dev/null
+++ b/examples/pytorch/image_recognition/segment_anything/README.md
@@ -0,0 +1,68 @@
+Step-by-Step
+============
+This document provides step-by-step instructions for applying post-training quantization (PTQ) to the Segment Anything Model (SAM) using the VOC dataset.
+
+# Prerequisite
+## Environment
+```shell
+# install dependencies
+pip install -r ./requirements.txt
+# retrieve the SAM model code and pre-trained weights
+pip install git+https://github.com/facebookresearch/segment-anything.git
+wget https://dl.fbaipublicfiles.com/segment_anything/sam_vit_b_01ec64.pth
+```
+
+# PTQ
+PTQ example for the Segment Anything Model (SAM) on the VOC dataset.
+
+## 1. Prepare VOC dataset
+```shell
+python download_dataset.py
+```
+
+## 2. Start PTQ
+```shell
+bash run_quant.sh --voc_dataset_location=./voc_dataset/VOCdevkit/VOC2012/ --pretrained_weight_location=./sam_vit_b_01ec64.pth
+```
+
+## 3. Benchmarking
+```shell
+bash run_benchmark.sh --tuned_checkpoint=./saved_results --voc_dataset_location=./voc_dataset/VOCdevkit/VOC2012/ --int8=True --mode=performance
+```
+Set `--mode=accuracy` to report the Dice score of the quantized model instead of measuring performance.
+
+# Result
+|          | Baseline (FP32) | INT8   |
+| -------- | --------------- | ------ |
+| Accuracy | 0.7939          | 0.7849 |
+
+# Saving and Loading Model
+
+* Saving model:
+  After tuning with Neural Compressor, `quantization.fit` returns a Neural Compressor model object:
+
+```python
+from neural_compressor import PostTrainingQuantConfig
+from neural_compressor import quantization
+conf = PostTrainingQuantConfig()
+q_model = quantization.fit(model,
+                           conf,
+                           calib_dataloader=val_loader,
+                           eval_func=eval_func)
+```
+
+Here, `q_model` is a Neural Compressor model object, which provides a `save` API:
+
+```python
+q_model.save("Path_to_save_quantized_model")
+```
+
+* Loading model:
+
+```python
+from neural_compressor.utils.pytorch import load
+quantized_model = load(os.path.abspath(os.path.expanduser(args.tuned_checkpoint)),
+                       model,
+                       dataloader=val_loader)
+```
+
+Refer to `main.py` for the complete example.
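+
+For reference, the `eval_func` passed to `quantization.fit` above is any callable that takes the model and returns a scalar accuracy. In `main.py` it computes the mean Dice score over the validation set; a minimal sketch of that idea (the `val_loader` name is illustrative, the metric mirrors `main.py`) is:
+
+```python
+import torchmetrics
+
+def eval_func(model):
+    # Mean Dice score over the validation set, ignoring the background class
+    metric = torchmetrics.Dice(ignore_index=0)
+    scores = []
+    for data, label in val_loader:
+        pred = model(data['image'], data['prompt'],
+                     data['original_size'], data['input_size'],
+                     data['ground_truth_mask'])
+        scores.append(metric(pred.reshape(-1), label.reshape(-1).int()).item())
+    return sum(scores) / len(scores)
+```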
diff --git a/examples/pytorch/image_recognition/segment_anything/download_dataset.py b/examples/pytorch/image_recognition/segment_anything/download_dataset.py new file mode 100644 index 00000000000..9547f49d567 --- /dev/null +++ b/examples/pytorch/image_recognition/segment_anything/download_dataset.py @@ -0,0 +1,7 @@ +import torchvision + +print("Downloading VOC dataset") +torchvision.datasets.VOCDetection(root='./voc_dataset', year='2012', image_set ='trainval', download=True) + + + diff --git a/examples/pytorch/image_recognition/segment_anything/inc_dataset_loader.py b/examples/pytorch/image_recognition/segment_anything/inc_dataset_loader.py new file mode 100644 index 00000000000..678f5c15df5 --- /dev/null +++ b/examples/pytorch/image_recognition/segment_anything/inc_dataset_loader.py @@ -0,0 +1,173 @@ +from segment_anything import SamPredictor, sam_model_registry +import torchvision +import torch +from PIL import Image + +import numpy as np +import os +import xml.etree.ElementTree as ET +from statistics import mean +from torch.nn.functional import threshold, normalize +import torch.nn.functional as F +from segment_anything.utils.transforms import ResizeLongestSide +from typing import List, Tuple + +# Pad image - based on SAM +def pad_image(x: torch.Tensor, square_length = 1024) -> torch.Tensor: + # C, H, W + h, w = x.shape[-2:] + padh = square_length - h + padw = square_length - w + x = F.pad(x, (0, padw, 0, padh)) + return x + +# Custom dataset +class INC_SAMVOC2012Dataset(object): + def __init__(self, voc_root, type): + self.voc_root = voc_root + self.num_of_data = -1 + self.dataset = {} # Item will be : ["filename", "class_name", [4x bounding boxes coordinates], etc) + self.resizelongestside = ResizeLongestSide(target_length=1024) + pixel_mean = [123.675, 116.28, 103.53] + pixel_std = [58.395, 57.12, 57.375] + self.pixel_mean = torch.Tensor(pixel_mean).view(-1, 1, 1) + self.pixel_std = torch.Tensor(pixel_std).view(-1, 1, 1) + + # Read through all the samples and output a dictionary + # Key of the dictionary will be idx + # Item of the dictionary will be filename, class id and bounding boxes + annotation_dir = os.path.join(voc_root, "Annotations") + files = os.listdir(annotation_dir) + files = [f for f in files if os.path.isfile(annotation_dir+'/'+f)] #Filter directory + annotation_files = [os.path.join(annotation_dir, x) for x in files] + + # Get the name list of the segmentation files + segmentation_dir = os.path.join(voc_root, "SegmentationObject") + files = os.listdir(segmentation_dir) + files = [f for f in files if os.path.isfile(segmentation_dir+'/'+f)] #Filter directory + segmentation_files = [x for x in files] + + + # Based on the type (train/val) to select data + train_val_dir = os.path.join(voc_root, 'ImageSets/Segmentation/') + if type == 'train': + txt_file_name = 'train.txt' + elif type =='val': + txt_file_name = 'val.txt' + else: + print('Error! 
Type of dataset should be ''train'' or ''val'' ') + + with open(train_val_dir + txt_file_name, 'r') as f: + permitted_files = [] + for row in f: + permitted_files.append(row.rstrip('\n')) + + for file in annotation_files: + file_name = file.split('/')[-1].split('.xml')[0] + + if not(file_name in permitted_files): + continue #skip the file + + if file_name + '.png' in segmentation_files: # check that if there is any related segmentation file for this annotation + tree = ET.parse(file) + root = tree.getroot() + for child in root: + if child.tag == 'object': + details = [file_name] + for node in child: + if node.tag == 'name': + object_name = node.text + if node.tag == 'bndbox': + for coordinates in node: + if coordinates.tag == 'xmax': + xmax = int(coordinates.text) + if coordinates.tag == 'xmin': + xmin = int(coordinates.text) + if coordinates.tag == 'ymax': + ymax = int(coordinates.text) + if coordinates.tag == 'ymin': + ymin = int(coordinates.text) + boundary = [xmin, ymin, xmax, ymax] + details.append(object_name) + details.append(boundary) + self.num_of_data += 1 + self.dataset[self.num_of_data] = details + + def __len__(self): + return self.num_of_data + + # Preprocess the segmentation mask. Output only 1 object semantic information. + def preprocess_segmentation(self, filename, bounding_box, pad=True): + + #read the semantic mask + segment_mask = Image.open(self.voc_root + 'SegmentationObject/' + filename + '.png') + segment_mask_np = torchvision.transforms.functional.pil_to_tensor(segment_mask) + + #Crop the segmentation based on the bounding box + xmin, ymin = int(bounding_box[0]), int(bounding_box[1]) + xmax, ymax = int(bounding_box[2]), int(bounding_box[3]) + cropped_mask = segment_mask.crop((xmin, ymin, xmax, ymax)) + cropped_mask_np = torchvision.transforms.functional.pil_to_tensor(cropped_mask) + + #Count the majority element + bincount = np.bincount(cropped_mask_np.reshape(-1)) + bincount[0] = 0 #Remove the black pixel + if (bincount.shape[0] >= 256): + bincount[255] = 0 #Remove the white pixel + majority_element = bincount.argmax() + + #Based on the majority element, binary mask the segmentation + segment_mask_np[np.where((segment_mask_np != 0) & (segment_mask_np != majority_element))] = 0 + segment_mask_np[segment_mask_np == majority_element] = 1 + + #Pad the segment mask to 1024x1024 (for batching in dataloader) + if pad: + segment_mask_np = pad_image(segment_mask_np) + + return segment_mask_np + + # Preprocess the image to an appropriate format for SAM + def preprocess_image(self, img): + # ~= predictor.py - set_image() + img = np.array(img) + input_image = self.resizelongestside.apply_image(img) + input_image_torch = torch.as_tensor(input_image, device='cpu') + input_image_torch = input_image_torch.permute(2, 0, 1).contiguous() + input_image_torch = (input_image_torch - self.pixel_mean) / self.pixel_std #normalize + original_size = img.shape[:2] + input_size = tuple(input_image_torch.shape[-2:]) + + return pad_image(input_image_torch), original_size, input_size + + def __getitem__(self, idx): + data = self.dataset[idx] + filename, classname = data[0], data[1] + bounding_box = data[2] + + # No padding + preprocessing + mask_gt = self.preprocess_segmentation(filename, bounding_box, pad=False) + + image, original_size, input_size = self.preprocess_image(Image.open(self.voc_root + 'JPEGImages/' + filename + '.jpg')) # read the image + prompt = bounding_box # bounding box - input_boxes x1, y1, x2, y2 + training_data = {} + training_data['image'] = image + 
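+        # The remaining entries mirror the arguments of Sam_INC.forward() in main.py: original_size
+        # and input_size let postprocess_masks() strip the padding and rescale the predicted mask,
+        # the bounding box is used as the box prompt, and the ground-truth mask is bundled so the
+        # sample dict matches the forward() signature.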
training_data["original_size"] = original_size + training_data["input_size"] = input_size + training_data["ground_truth_mask"] = mask_gt + training_data["prompt"] = prompt + return (training_data, mask_gt) #data, label + + +class INC_SAMVOC2012Dataloader: + def __init__(self, batch_size, **kwargs): + self.batch_size = batch_size + self.dataset = [] + ds = INC_SAMVOC2012Dataset(kwargs['voc_root'], kwargs['type']) + # operations to add (input_data, label) pairs into self.dataset + for i in range(len(ds)): + self.dataset.append(ds[i]) + + + def __iter__(self): + for input_data, label in self.dataset: + yield input_data, label \ No newline at end of file diff --git a/examples/pytorch/image_recognition/segment_anything/main.py b/examples/pytorch/image_recognition/segment_anything/main.py new file mode 100644 index 00000000000..275044cea3e --- /dev/null +++ b/examples/pytorch/image_recognition/segment_anything/main.py @@ -0,0 +1,349 @@ +from segment_anything import SamPredictor, sam_model_registry +import torchvision +import torch +from PIL import Image +import numpy as np +import os +from tqdm import tqdm +from statistics import mean +import torch +from torch import nn +from torch.nn.functional import threshold, normalize +import torch.nn.functional as F +from typing import List, Tuple +from copy import deepcopy +import torchmetrics +from segment_anything.modeling import ImageEncoderViT, MaskDecoder, PromptEncoder, TwoWayTransformer +from typing import Dict, Any +from neural_compressor import quantization, PostTrainingQuantConfig +from neural_compressor.config import TuningCriterion, AccuracyCriterion +from inc_dataset_loader import INC_SAMVOC2012Dataset +from neural_compressor.data import DataLoader +from neural_compressor.quantization import fit +from functools import partial +import argparse + +# Preprocessing codes are adapted from original SAM's implementation +# Ref: https://github.com/facebookresearch/segment-anything/blob/c1910835a32a05cbb79bdacbec8f25914a7e3a20/segment_anything/modeling/sam.py + +def get_preprocess_shape(oldh: int, oldw: int, long_side_length: int) -> Tuple[int, int]: + scale = long_side_length * 1.0 / max(oldh, oldw) + newh, neww = oldh * scale, oldw * scale + neww = int(neww + 0.5) + newh = int(newh + 0.5) + return (newh, neww) + + +def apply_coords(coords: np.ndarray, original_size: Tuple[int, ...]) -> np.ndarray: + target_length = 1024 + old_h, old_w = original_size + new_h, new_w = get_preprocess_shape(original_size[0], original_size[1], target_length) + coords = deepcopy(coords).astype(float) + coords[..., 0] = coords[..., 0] * (new_w / old_w) + coords[..., 1] = coords[..., 1] * (new_h / old_h) + return coords + + +def apply_boxes(boxes: np.ndarray, original_size: Tuple[int, ...]) -> np.ndarray: + boxes = apply_coords(boxes.reshape(-1, 2, 2), original_size) + return boxes.reshape(-1, 4) + + +def postprocess_masks( + masks: torch.Tensor, + input_size: Tuple[int, ...], + original_size: Tuple[int, ...], +) -> List[torch.Tensor]: + image_encoder_img_size = 1024 + + masks = F.interpolate( + masks, + (image_encoder_img_size, image_encoder_img_size), + mode="bilinear", + align_corners=False, + ) + + unpadded_mask = masks[..., : input_size[0], : input_size[1]] + mask = F.interpolate(unpadded_mask, original_size, mode="bilinear", align_corners=False) + mask = mask[0] #Remove the unnecessary batch dimension + + return mask + + +class Sam_INC(nn.Module): + mask_threshold: float = 0.0 + image_format: str = "RGB" + + def __init__( + self, + pixel_mean: List[float] = [123.675, 
116.28, 103.53], + pixel_std: List[float] = [58.395, 57.12, 57.375], + ) -> None: + super().__init__() + + # Moved from _build_sam() + # Specs for build_sam_vit_b + encoder_embed_dim=768 + encoder_depth=12 + encoder_num_heads=12 + encoder_global_attn_indexes=[2, 5, 8, 11] + + prompt_embed_dim = 256 + image_size = 1024 + vit_patch_size = 16 + image_embedding_size = image_size // vit_patch_size + + image_encoder=ImageEncoderViT( + depth=encoder_depth, + embed_dim=encoder_embed_dim, + img_size=image_size, + mlp_ratio=4, + norm_layer=partial(torch.nn.LayerNorm, eps=1e-6), + num_heads=encoder_num_heads, + patch_size=vit_patch_size, + qkv_bias=True, + use_rel_pos=True, + global_attn_indexes=encoder_global_attn_indexes, + window_size=14, + out_chans=prompt_embed_dim, + ) + + prompt_encoder=PromptEncoder( + embed_dim=prompt_embed_dim, + image_embedding_size=(image_embedding_size, image_embedding_size), + input_image_size=(image_size, image_size), + mask_in_chans=16, + ) + + mask_decoder=MaskDecoder( + num_multimask_outputs=3, + transformer=TwoWayTransformer( + depth=2, + embedding_dim=prompt_embed_dim, + mlp_dim=2048, + num_heads=8, + ), + transformer_dim=prompt_embed_dim, + iou_head_depth=3, + iou_head_hidden_dim=256, + ) + + self.image_encoder = image_encoder + self.prompt_encoder = prompt_encoder + self.mask_decoder = mask_decoder + self.register_buffer("pixel_mean", torch.Tensor(pixel_mean).view(-1, 1, 1), False) + self.register_buffer("pixel_std", torch.Tensor(pixel_std).view(-1, 1, 1), False) + + @property + def device(self) -> Any: + return self.pixel_mean.device + + def forward( + self, + image, + prompt, + original_size, + input_size, + ground_truth_mask, + ): + + #Encode the images + if len(image.shape) == 3: + image = image[None, ...] # Append batch information for image_encoder + + image_embeddings = self.image_encoder(image) + + + input = np.zeros(4) + input[0] = prompt[0] + input[1] = prompt[1] + input[2] = prompt[2] + input[3] = prompt[3] + + original_size_tuple = (original_size[0].item(), original_size[1].item()) # H, W + transformed_boxes = apply_boxes(input, original_size_tuple) + transformed_boxes = torch.as_tensor(transformed_boxes, dtype=torch.float) + transformed_boxes = transformed_boxes[None, :] + + sparse_embeddings, dense_embeddings = self.prompt_encoder( + points=None, # Ignore point + boxes=transformed_boxes, #Take only 1 box as input + masks=None, # Ignore mask + ) + + low_res_masks, iou_predictions = self.mask_decoder( + image_embeddings=image_embeddings[0], + image_pe=self.prompt_encoder.get_dense_pe(), + sparse_prompt_embeddings=sparse_embeddings, + dense_prompt_embeddings=dense_embeddings, + multimask_output=False, + ) + + #Post process + masks = self.postprocess_masks( + low_res_masks, + input_size=input_size, #x['input_size'], + original_size=original_size_tuple, + ) + + masks = masks > self.mask_threshold + return masks[0].int() # Output pred for dataloader to compare + + def postprocess_masks( + self, + masks: torch.Tensor, + input_size: Tuple[int, ...], + original_size: Tuple[int, ...], + ) -> torch.Tensor: + + masks = F.interpolate( + masks, + (self.image_encoder.img_size, self.image_encoder.img_size), + mode="bilinear", + align_corners=False, + ) + masks = masks[..., : input_size[0], : input_size[1]] + masks = F.interpolate(masks, original_size, mode="bilinear", align_corners=False) + return masks + + def preprocess(self, x: torch.Tensor) -> torch.Tensor: + # Normalize colors + x = (x - self.pixel_mean) / self.pixel_std + + # Pad + h, w = x.shape[-2:] + padh 
= self.image_encoder.img_size - h + padw = self.image_encoder.img_size - w + x = F.pad(x, (0, padw, 0, padh)) + return x + + +def eval_func(model): + device = 'cpu' + metric = torchmetrics.Dice(ignore_index=0).to(device) #Ignore background + list_of_metrics = [] + + for i, (input, gt) in enumerate(eval_dataloader): + preds = model(input['image'], + input['prompt'], + input["original_size"], + input["input_size"], + input["ground_truth_mask"]) + labels = gt + result = metric(preds.reshape(-1), labels.reshape(-1).int()) + list_of_metrics.append(result) + + return np.array(list_of_metrics).mean() + + +def validate(eval_dataloader, model, args): + model.eval() + device = 'cpu' + metric = torchmetrics.Dice(ignore_index=0).to(device) + list_of_metrics = [] + + with torch.no_grad(): + for i, (input, gt) in enumerate(eval_dataloader): + preds = model(input['image'], + input['prompt'], + input["original_size"], + input["input_size"], + input["ground_truth_mask"]) + labels = gt + result = metric(preds.reshape(-1), labels.reshape(-1).int()) + list_of_metrics.append(result) + + print("Average Dice Score: " + str(np.array(list_of_metrics).mean()) ) + return + + +# Start PTQ +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='PyTorch VOC Training') + parser.add_argument("--pretrained_weight_location", default='./sam_vit_b_01ec64.pth', type=str, + help='Location of the image encoder pretrained weights') + parser.add_argument('--voc_dataset_location', default='./voc_dataset/VOCdevkit/VOC2012/', type=str, + help='Path of the VOC Dataset') + parser.add_argument("--tuned_checkpoint", default='./saved_results', type=str, metavar='PATH', + help='path to checkpoint tuned by Neural Compressor (default: ./)') + parser.add_argument("--tune", action='store_true', + help='Apply INT8 quantization or not') + parser.add_argument('--int8', action='store_true', + help='for benchmarking/validation using quantized model') + parser.add_argument('--accuracy', action='store_true', + help='For accuracy (dice score) measurements') + parser.add_argument("--iter", default=0, type=int, + help='For dice measurement only.') + parser.add_argument("--performance", action='store_true', + help='For benchmaking') + + args = parser.parse_args() + + # Prepare the model + model = Sam_INC() + model.load_state_dict(torch.load(args.pretrained_weight_location)) + train_inc_dataset = INC_SAMVOC2012Dataset(args.voc_dataset_location, 'train') + eval_inc_dataset = INC_SAMVOC2012Dataset(args.voc_dataset_location, 'val') + calib_dataloader = DataLoader(framework="pytorch", dataset=train_inc_dataset) + eval_dataloader = DataLoader(framework="pytorch", dataset=eval_inc_dataset) + + # quantization + if args.tune: + op_type_dict={ + 'Embedding': + { + 'weight': {'dtype': 'fp32'}, + 'activation': {'dtype': 'fp32'}, + }, + 'ConvTranspose2d': + { + 'weight': {'dtype': 'fp32'}, + 'activation': {'dtype': 'fp32'}, + }, + 'Conv2d': + { + 'weight': {'dtype': 'int8'}, + 'activation': {'dtype': 'int8'}, + }, + 'Linear': + { + 'weight': {'dtype': 'int8'}, + 'activation': {'dtype': 'int8'}, + }, + 'LinearReLU': + { + 'weight': {'dtype': 'int8'}, + 'activation': {'dtype': 'int8'}, + }, + 'LayerNorm': + { + 'weight': {'dtype': 'fp32'}, + 'activation': {'dtype': 'fp32'}, + }, + } + + accuracy_criterion=AccuracyCriterion(tolerable_loss=0.05) + tuning_criterion=TuningCriterion(timeout=0, max_trials=1) + config = PostTrainingQuantConfig(op_type_dict=op_type_dict, tuning_criterion=tuning_criterion, accuracy_criterion=accuracy_criterion) + 
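+        # Per the op_type_dict above, Embedding, ConvTranspose2d and LayerNorm ops stay in FP32,
+        # while Conv2d, Linear and fused LinearReLU ops are quantized to INT8. The accuracy
+        # criterion tolerates up to a 5% relative drop, and tuning stops after a single trial.
+        # BF16 conversion is disabled below so the model uses only FP32 and INT8 kernels.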
config.use_bf16=False + + q_model = fit(model, config, calib_dataloader=calib_dataloader, eval_func=eval_func) + q_model.save(args.tuned_checkpoint) + + # benchmark/evaluation + if args.int8: + from neural_compressor.utils.pytorch import load + new_model = load(os.path.abspath(os.path.expanduser(args.tuned_checkpoint)), model, dataloader=eval_dataloader) + else: + new_model = model + + if args.performance: + from neural_compressor.config import BenchmarkConfig + from neural_compressor import benchmark + b_conf = BenchmarkConfig(warmup=5, + iteration=args.iter, + cores_per_instance=52, + num_of_instance=1) + benchmark.fit(new_model, b_conf, b_dataloader=eval_dataloader) + if args.accuracy: + validate(eval_dataloader, new_model, args) diff --git a/examples/pytorch/image_recognition/segment_anything/requirements.txt b/examples/pytorch/image_recognition/segment_anything/requirements.txt new file mode 100644 index 00000000000..c16e8d9b31f --- /dev/null +++ b/examples/pytorch/image_recognition/segment_anything/requirements.txt @@ -0,0 +1,4 @@ +torchvision +tdqm +torchmetrics +neural-compressor \ No newline at end of file diff --git a/examples/pytorch/image_recognition/segment_anything/run_benchmark.sh b/examples/pytorch/image_recognition/segment_anything/run_benchmark.sh new file mode 100644 index 00000000000..69e23e594c6 --- /dev/null +++ b/examples/pytorch/image_recognition/segment_anything/run_benchmark.sh @@ -0,0 +1,67 @@ +#!/bin/bash +set -x + +function main { + + init_params "$@" + run_benchmark + +} + +# init params +function init_params { + iters=100 + tuned_checkpoint=./saved_results + voc_dataset_location=./voc_dataset/VOCdevkit/VOC2012/ + for var in "$@" + do + case $var in + --tuned_checkpoint=*) + tuned_checkpoint=$(echo $var |cut -f2 -d=) + ;; + --voc_dataset_location=*) + voc_dataset_location=$(echo $var |cut -f2 -d=) + ;; + --mode=*) + mode=$(echo $var |cut -f2 -d=) + ;; + --iters=*) + iters=$(echo ${var} |cut -f2 -d=) + ;; + --int8=*) + int8=$(echo ${var} |cut -f2 -d=) + ;; + *) + echo "Error: No such parameter: ${var}" + exit 1 + ;; + esac + done + +} + + +# run_benchmark +function run_benchmark { + if [[ ${mode} == "accuracy" ]]; then + mode_cmd=" --accuracy" + elif [[ ${mode} == "performance" ]]; then + mode_cmd=" --iter ${iters} --performance" + else + echo "Error: No such mode: ${mode}" + exit 1 + fi + + if [[ ${int8} == "True" ]]; then + extra_cmd=" --int8 --voc_dataset_location ${voc_dataset_location}" + else + extra_cmd=" --voc_dataset_location ${voc_dataset_location} " + fi + + python main.py \ + --tuned_checkpoint ${tuned_checkpoint} \ + ${mode_cmd} \ + ${extra_cmd} +} + +main "$@" \ No newline at end of file diff --git a/examples/pytorch/image_recognition/segment_anything/run_quant.sh b/examples/pytorch/image_recognition/segment_anything/run_quant.sh new file mode 100644 index 00000000000..ba4edba22c9 --- /dev/null +++ b/examples/pytorch/image_recognition/segment_anything/run_quant.sh @@ -0,0 +1,46 @@ +#!/bin/bash +set -x + +function main { + + init_params "$@" + run_tuning + +} + +# init params +function init_params { + output_model=saved_results + pretrained_weight_location=./sam_vit_b_01ec64.pth + voc_dataset_location=./voc_dataset/VOCdevkit/VOC2012/ + for var in "$@" + do + case $var in + --voc_dataset_location=*) + voc_dataset_location=$(echo $var |cut -f2 -d=) + ;; + --pretrained_weight_location=*) + pretrained_weight_location=$(echo $var |cut -f2 -d=) + ;; + --output_model=*) + output_model=$(echo $var |cut -f2 -d=) + ;; + *) + echo "Error: No such parameter: 
${var}" + exit 1 + ;; + esac + done + +} + +# run_tuning +function run_tuning { + python main.py \ + --pretrained_weight_location ${pretrained_weight_location} \ + --tuned_checkpoint ${output_model} \ + --voc_dataset_location ${voc_dataset_location} \ + --tune +} + +main "$@" \ No newline at end of file