diff --git a/.gitignore b/.gitignore
index 223e87a19..d52f92b83 100644
--- a/.gitignore
+++ b/.gitignore
@@ -25,6 +25,10 @@ dist/
# Pycharm editor settings
.idea

+# MacOS
+.DS_Store
+
# project dirs
/datasets
/models
+/output
diff --git a/INSTALL.md b/INSTALL.md
index 8365be8f9..b1bfaa293 100644
--- a/INSTALL.md
+++ b/INSTALL.md
@@ -7,7 +7,7 @@
- yacs
- matplotlib
- GCC >= 4.9
-- (optional) OpenCV for the webcam demo
+- OpenCV

### Option 1: Step-by-step installation

@@ -24,7 +24,7 @@ conda activate maskrcnn_benchmark
conda install ipython

# maskrcnn_benchmark and coco api dependencies
-pip install ninja yacs cython matplotlib tqdm
+pip install ninja yacs cython matplotlib tqdm opencv-python

# follow PyTorch installation in https://pytorch.org/get-started/locally/
# we give the instructions for CUDA 9.0
@@ -38,6 +38,12 @@ git clone https://github.com/cocodataset/cocoapi.git
cd cocoapi/PythonAPI
python setup.py build_ext install

+# install apex
+cd $INSTALL_DIR
+git clone https://github.com/NVIDIA/apex.git
+cd apex
+python setup.py install --cuda_ext --cpp_ext
+
# install PyTorch Detection
cd $INSTALL_DIR
git clone https://github.com/facebookresearch/maskrcnn-benchmark.git
@@ -55,16 +61,71 @@ unset INSTALL_DIR
# or if you are on macOS
# MACOSX_DEPLOYMENT_TARGET=10.9 CC=clang CXX=clang++ python setup.py build develop
```
+#### Windows 10
+```bash
+# open a cmd and change to the desired installation directory,
+# which from now on will be referred to as INSTALL_DIR
+conda create --name maskrcnn_benchmark
+conda activate maskrcnn_benchmark
+
+# this installs the right pip and dependencies for the fresh python
+conda install ipython
+
+# maskrcnn_benchmark and coco api dependencies
+pip install ninja yacs cython matplotlib tqdm opencv-python
+
+# follow PyTorch installation in https://pytorch.org/get-started/locally/
+# we give the instructions for CUDA 9.0
+## Important: check the CUDA version installed on your computer by running this command in the cmd:
+nvcc --version
+conda install -c pytorch pytorch-nightly torchvision cudatoolkit=9.0
+
+git clone https://github.com/cocodataset/cocoapi.git
+
+ # To prevent an installation error, do the following after cloning cocoapi:
+ # using the file explorer, navigate to cocoapi\PythonAPI\setup.py and change line 14 from:
+ # extra_compile_args=['-Wno-cpp', '-Wno-unused-function', '-std=c99'],
+ # to
+ # extra_compile_args={'gcc': ['/Qstd=c99']},
+ # Based on https://github.com/cocodataset/cocoapi/issues/51
+cd cocoapi/PythonAPI
+python setup.py build_ext install
+
+# navigate back to INSTALL_DIR
+cd ..
+cd ..
+
+# install apex
+git clone https://github.com/NVIDIA/apex.git
+cd apex
+python setup.py install --cuda_ext --cpp_ext
+
+# navigate back to INSTALL_DIR
+cd ..
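+
+# (optional sanity check, not part of the original instructions) verify that
+# apex can be imported; if the --cuda_ext/--cpp_ext build failed above, apex
+# can also be installed without the extensions via a plain `python setup.py install`
+python -c "import apex"
+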
+# install PyTorch Detection
+
+git clone https://github.com/Idolized22/maskrcnn-benchmark.git
+cd maskrcnn-benchmark
+
+# the following will install the lib with
+# symbolic links, so that you can modify
+# the files if you want and won't need to
+# re-build it
+python setup.py build develop
+```

### Option 2: Docker Image (Requires CUDA, Linux only)

-Build image with defaults (`CUDA=9.0`, `CUDNN=7`):
+Build image with defaults (`CUDA=9.0`, `CUDNN=7`, `FORCE_CUDA=1`):

    nvidia-docker build -t maskrcnn-benchmark docker/

Build image with other CUDA and CUDNN versions:

-    nvidia-docker build -t maskrcnn-benchmark --build-arg CUDA=9.2 --build-arg CUDNN=7 docker/
+    nvidia-docker build -t maskrcnn-benchmark --build-arg CUDA=9.2 --build-arg CUDNN=7 docker/
+
+Build image with FORCE_CUDA disabled:
+
+    nvidia-docker build -t maskrcnn-benchmark --build-arg FORCE_CUDA=0 docker/

Build and run image with built-in jupyter notebook(note that the password is used to log in jupyter notebook):

diff --git a/README.md b/README.md
index 01a81d812..baa7c6bc4 100644
--- a/README.md
+++ b/README.md
@@ -129,7 +129,7 @@ you'll also need to change the learning rate, the number of iterations and the l
Here is an example for Mask R-CNN R-50 FPN with the 1x schedule:
```bash
-python tools/train_net.py --config-file "configs/e2e_mask_rcnn_R_50_FPN_1x.yaml" SOLVER.IMS_PER_BATCH 2 SOLVER.BASE_LR 0.0025 SOLVER.MAX_ITER 720000 SOLVER.STEPS "(480000, 640000)" TEST.IMS_PER_BATCH 1
+python tools/train_net.py --config-file "configs/e2e_mask_rcnn_R_50_FPN_1x.yaml" SOLVER.IMS_PER_BATCH 2 SOLVER.BASE_LR 0.0025 SOLVER.MAX_ITER 720000 SOLVER.STEPS "(480000, 640000)" TEST.IMS_PER_BATCH 1 MODEL.RPN.FPN_POST_NMS_TOP_N_TRAIN 2000
```
This follows the [scheduling rules from Detectron.](https://github.com/facebookresearch/Detectron/blob/master/configs/getting_started/tutorial_1gpu_e2e_faster_rcnn_R-50-FPN.yaml#L14-L30)
Note that we have multiplied the number of iterations by 8x (as well as the learning rate schedules),
@@ -138,6 +138,7 @@ and we have divided the learning rate by 8x.
We also changed the batch size during testing, but that is generally not necessary because testing
requires much less memory than training.

+Furthermore, we set `MODEL.RPN.FPN_POST_NMS_TOP_N_TRAIN 2000`, as the proposals are selected per batch rather than per image in the default training configuration. The value is calculated as **1000 x images-per-gpu**. Here we have 2 images per GPU, so we set it to 1000 x 2 = 2000. If we had 8 images per GPU, the value would be 8000. Note that this does not apply if `MODEL.RPN.FPN_POST_NMS_PER_BATCH` is set to `False` during training. See [#672](https://github.com/facebookresearch/maskrcnn-benchmark/issues/672) for more details.

### Multi-GPU training
We use internally `torch.distributed.launch` in order to launch
@@ -147,8 +148,17 @@ process will only use a single GPU.

```bash
export NGPUS=8
-python -m torch.distributed.launch --nproc_per_node=$NGPUS /path_to_maskrcnn_benchmark/tools/train_net.py --config-file "path/to/config/file.yaml"
+python -m torch.distributed.launch --nproc_per_node=$NGPUS /path_to_maskrcnn_benchmark/tools/train_net.py --config-file "path/to/config/file.yaml" MODEL.RPN.FPN_POST_NMS_TOP_N_TRAIN images_per_gpu x 1000
```
+Note that `MODEL.RPN.FPN_POST_NMS_TOP_N_TRAIN` should be set following the same rule as in single-GPU training.
+
+## Evaluation
+You can test your model directly on a single GPU or on multiple GPUs.
+Here is an example for Mask R-CNN R-50 FPN with the 1x schedule on 8 GPUs:
+```bash
+export NGPUS=8
+python -m torch.distributed.launch --nproc_per_node=$NGPUS /path_to_maskrcnn_benchmark/tools/test_net.py --config-file "configs/e2e_mask_rcnn_R_50_FPN_1x.yaml" TEST.IMS_PER_BATCH 16
+```
+To calculate mAP for each class, you can simply modify a few lines in [coco_eval.py](https://github.com/facebookresearch/maskrcnn-benchmark/blob/master/maskrcnn_benchmark/data/datasets/evaluation/coco/coco_eval.py). See [#524](https://github.com/facebookresearch/maskrcnn-benchmark/issues/524#issuecomment-475118810) for more details.

## Abstractions
For more information on some of the main abstractions in our implementation, see [ABSTRACTIONS.md](ABSTRACTIONS.md).

@@ -198,11 +208,21 @@ That's it. You can also add extra fields to the boxlist, such as segmentation ma

For a full example of how the `COCODataset` is implemented, check [`maskrcnn_benchmark/data/datasets/coco.py`](maskrcnn_benchmark/data/datasets/coco.py).

-### Note:
+Once you have created your dataset, it needs to be added in a couple of places:
+- [`maskrcnn_benchmark/data/datasets/__init__.py`](maskrcnn_benchmark/data/datasets/__init__.py): add it to `__all__`
+- [`maskrcnn_benchmark/config/paths_catalog.py`](maskrcnn_benchmark/config/paths_catalog.py): `DatasetCatalog.DATASETS` and the corresponding `if` clause in `DatasetCatalog.get()`
+
+### Testing
While the aforementioned example should work for training, we leverage the
cocoApi for computing the accuracies during testing. Thus, test datasets
should currently follow the cocoApi for now.
+To enable your dataset for testing, add a corresponding `if` statement in [`maskrcnn_benchmark/data/datasets/evaluation/__init__.py`](maskrcnn_benchmark/data/datasets/evaluation/__init__.py):
+```python
+if isinstance(dataset, datasets.MyDataset):
+    return coco_evaluation(**args)
+```
+
## Finetuning from Detectron weights on custom datasets
Create a script `tools/trim_detectron_model.py` like [here](https://gist.github.com/wangg12/aea194aa6ab6a4de088f14ee193fd968).
You can decide which keys to be removed and which keys to be kept by modifying the script.

@@ -233,8 +253,9 @@ note = {Accessed: [Insert date here]}

- [RetinaMask: Learning to predict masks improves state-of-the-art single-shot detection for free](https://arxiv.org/abs/1901.03353).
  Cheng-Yang Fu, Mykhailo Shvets, and Alexander C. Berg.
  Tech report, arXiv,1901.03353.
-
-
+- [FCOS: Fully Convolutional One-Stage Object Detection](https://arxiv.org/abs/1904.01355).
+  Zhi Tian, Chunhua Shen, Hao Chen and Tong He.
+  Tech report, arXiv,1904.01355. [[code](https://github.com/tianzhi0549/FCOS)]

## License

diff --git a/configs/cityscapes/README.md b/configs/cityscapes/README.md
new file mode 100644
index 000000000..b8595f79c
--- /dev/null
+++ b/configs/cityscapes/README.md
@@ -0,0 +1,217 @@
+### Paper
+1. [mask-rcnn](https://arxiv.org/pdf/1703.06870.pdf)
+
+
+### Dataset
+1. [cityscapesScripts](https://github.com/mcordts/cityscapesScripts)
+
+
+### Performance (from paper)
+| case     | training data | im/gpu | mask AP [val] | mask AP [test] | mask AP50 [test] |
+|----------|:-------------:|:------:|:-------------:|:--------------:|-----------------:|
+| R-50-FPN | fine          | 8/8    | 31.5          | 26.2           | 49.9             |
+| R-50-FPN | fine + COCO   | 8/8    | 36.4          | 32.0           | 58.1             |
+
+
+### Note (from paper)
+We apply our Mask R-CNN models with the ResNet-FPN-50 backbone; we found the 101-layer counterpart performs similarly due to the small dataset size.
+We train with image scale (shorter side) randomly sampled from [800, 1024], which reduces overfitting; inference is on a single scale of 1024 pixels. We use a mini-batch size of 1 image per GPU (so 8 on 8 GPUs) and train the model for 24k iterations, starting from a learning rate of 0.01 and reducing it to 0.001 at 18k iterations. It takes ∼4 hours of training on a single 8-GPU machine under this setting.
+
+
+### Implementation (for finetuning from a COCO-trained model)
+Step 1: download the COCO-trained model from the [model zoo](https://download.pytorch.org/models/maskrcnn/e2e_mask_rcnn_R_50_FPN_1x.pth)
+Step 2: perform the model surgery below on the trained model and use the result as the `pretrained model` for finetuning:
+```python
+import torch
+from torch import nn
+# Conv2d as exported by maskrcnn_benchmark; torch.nn.Conv2d also works here
+from maskrcnn_benchmark.layers import Conv2d
+
+
+def clip_weights_from_pretrain_of_coco_to_cityscapes(f, out_file):
+    """Remap the COCO-pretrained predictor weights to the Cityscapes classes."""
+    # COCO categories for pretty print
+    COCO_CATEGORIES = [
+        "__background__",
+        "person",
+        "bicycle",
+        "car",
+        "motorcycle",
+        "airplane",
+        "bus",
+        "train",
+        "truck",
+        "boat",
+        "traffic light",
+        "fire hydrant",
+        "stop sign",
+        "parking meter",
+        "bench",
+        "bird",
+        "cat",
+        "dog",
+        "horse",
+        "sheep",
+        "cow",
+        "elephant",
+        "bear",
+        "zebra",
+        "giraffe",
+        "backpack",
+        "umbrella",
+        "handbag",
+        "tie",
+        "suitcase",
+        "frisbee",
+        "skis",
+        "snowboard",
+        "sports ball",
+        "kite",
+        "baseball bat",
+        "baseball glove",
+        "skateboard",
+        "surfboard",
+        "tennis racket",
+        "bottle",
+        "wine glass",
+        "cup",
+        "fork",
+        "knife",
+        "spoon",
+        "bowl",
+        "banana",
+        "apple",
+        "sandwich",
+        "orange",
+        "broccoli",
+        "carrot",
+        "hot dog",
+        "pizza",
+        "donut",
+        "cake",
+        "chair",
+        "couch",
+        "potted plant",
+        "bed",
+        "dining table",
+        "toilet",
+        "tv",
+        "laptop",
+        "mouse",
+        "remote",
+        "keyboard",
+        "cell phone",
+        "microwave",
+        "oven",
+        "toaster",
+        "sink",
+        "refrigerator",
+        "book",
+        "clock",
+        "vase",
+        "scissors",
+        "teddy bear",
+        "hair drier",
+        "toothbrush",
+    ]
+    # Cityscapes of fine categories for pretty print
+    CITYSCAPES_FINE_CATEGORIES = [
+        "__background__",
+        "person",
+        "rider",
+        "car",
+        "truck",
+        "bus",
+        "train",
+        "motorcycle",
+        "bicycle",
+    ]
+    coco_cats = COCO_CATEGORIES
+    cityscapes_cats = CITYSCAPES_FINE_CATEGORIES
+    coco_cats_to_inds = dict(zip(coco_cats, range(len(coco_cats))))
+    cityscapes_cats_to_inds = dict(
+        zip(cityscapes_cats, range(len(cityscapes_cats)))
+    )
+
+    checkpoint = torch.load(f)
+    m = checkpoint['model']
+
+    weight_names = {
+        "cls_score": "module.roi_heads.box.predictor.cls_score.weight",
+        "bbox_pred": "module.roi_heads.box.predictor.bbox_pred.weight",
+        "mask_fcn_logits": "module.roi_heads.mask.predictor.mask_fcn_logits.weight",
+    }
+    bias_names = {
+        "cls_score": "module.roi_heads.box.predictor.cls_score.bias",
+        "bbox_pred": "module.roi_heads.box.predictor.bbox_pred.bias",
+        "mask_fcn_logits": "module.roi_heads.mask.predictor.mask_fcn_logits.bias",
+    }
+
+    representation_size = m[weight_names["cls_score"]].size(1)
+    cls_score = nn.Linear(representation_size, len(cityscapes_cats))
+    nn.init.normal_(cls_score.weight, std=0.01)
+    nn.init.constant_(cls_score.bias, 0)
+
+    representation_size = m[weight_names["bbox_pred"]].size(1)
+    class_agnostic = m[weight_names["bbox_pred"]].size(0) != len(coco_cats) * 4
+    num_bbox_reg_classes = 2 if class_agnostic else len(cityscapes_cats)
+    bbox_pred = nn.Linear(representation_size, num_bbox_reg_classes * 4)
+    nn.init.normal_(bbox_pred.weight, std=0.001)
+    nn.init.constant_(bbox_pred.bias, 0)
+
+    dim_reduced = m[weight_names["mask_fcn_logits"]].size(1)
+    mask_fcn_logits = Conv2d(dim_reduced, len(cityscapes_cats), 1, 1, 0)
+    nn.init.constant_(mask_fcn_logits.bias, 0)
+    nn.init.kaiming_normal_(
+        mask_fcn_logits.weight, mode="fan_out", nonlinearity="relu"
+    )
+
+    def _copy_weight(src_weight, dst_weight):
+        for ix, cat in enumerate(cityscapes_cats):
+            if cat not in coco_cats:
+                continue
+            jx = coco_cats_to_inds[cat]
+            dst_weight[ix] = src_weight[jx]
+        return dst_weight
+
+    def _copy_bias(src_bias, dst_bias, class_agnostic=False):
+        if class_agnostic:
+            return dst_bias
+        return _copy_weight(src_bias, dst_bias)
+
+    m[weight_names["cls_score"]] = _copy_weight(
+        m[weight_names["cls_score"]], cls_score.weight
+    )
+    m[weight_names["bbox_pred"]] = _copy_weight(
+        m[weight_names["bbox_pred"]], bbox_pred.weight
+    )
+    m[weight_names["mask_fcn_logits"]] = _copy_weight(
+        m[weight_names["mask_fcn_logits"]], mask_fcn_logits.weight
+    )
+
+    m[bias_names["cls_score"]] = _copy_bias(
+        m[bias_names["cls_score"]], cls_score.bias
+    )
+    m[bias_names["bbox_pred"]] = _copy_bias(
+        m[bias_names["bbox_pred"]], bbox_pred.bias, class_agnostic
+    )
+    m[bias_names["mask_fcn_logits"]] = _copy_bias(
+        m[bias_names["mask_fcn_logits"]], mask_fcn_logits.bias
+    )
+
+    print("f: {}\nout_file: {}".format(f, out_file))
+    torch.save(m, out_file)
+```
+Step 3: modify the `INPUT`, `WEIGHT` and `SOLVER` configuration in the `yaml` file, like this:
+```
+MODEL:
+  WEIGHT: "xxx.pth" # the model you saved from the code above
+
+INPUT:
+  MIN_SIZE_TRAIN: (800, 832, 864, 896, 928, 960, 992, 1024, 1024)
+  MAX_SIZE_TRAIN: 2048
+  MIN_SIZE_TEST: 1024
+  MAX_SIZE_TEST: 2048
+
+SOLVER:
+  BASE_LR: 0.01
+  IMS_PER_BATCH: 8
+  WEIGHT_DECAY: 0.0001
+  STEPS: (3000,)
+  MAX_ITER: 4000
+```
+Step 4: train the model with the modified config.
+
diff --git a/configs/dcn/README.md b/configs/dcn/README.md
new file mode 100644
index 000000000..2c3a0784e
--- /dev/null
+++ b/configs/dcn/README.md
@@ -0,0 +1,31 @@
+### Reference
+1. [Deformable ConvNets v2: More Deformable, Better Results](https://arxiv.org/pdf/1811.11168.pdf)
+2. third-party: [mmdetection](https://github.com/open-mmlab/mmdetection/tree/master/configs/dcn)
+
+### Performance
+| case                        | bbox AP | mask AP |
+|----------------------------:|--------:|:-------:|
+| R-50-FPN-dcn (implement)    | 39.8    |    -    |
+| R-50-FPN-dcn (mmdetection)  | 40.0    |    -    |
+| R-50-FPN-mdcn (implement)   | 40.0    |    -    |
+| R-50-FPN-mdcn (mmdetection) | 40.3    |    -    |
+| R-50-FPN-dcn (implement)    | 40.8    |  36.8   |
+| R-50-FPN-dcn (mmdetection)  | 41.1    |  37.2   |
+| R-50-FPN-dcn (implement)    | 40.7    |  36.7   |
+| R-50-FPN-dcn (mmdetection)  | 41.4    |  37.4   |
+
+
+### Note
+See [dcn-v2](https://github.com/open-mmlab/mmdetection/blob/master/MODEL_ZOO.md#deformable-convolution-v2) in `mmdetection` for more details.
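+
+As a quick way to try these settings, the new configs under `configs/dcn/` can be passed to the existing training scripts. The command below is only a sketch, assuming the standard 8-GPU `torch.distributed.launch` workflow from the top-level README and a working directory at the repository root; the `### Usage` section below lists the config keys these files set.
+```bash
+export NGPUS=8
+python -m torch.distributed.launch --nproc_per_node=$NGPUS tools/train_net.py \
+    --config-file "configs/dcn/e2e_mask_rcnn_dconv_R_50_FPN_1x.yaml"
+```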
+ + +### Usage +add these three lines +``` +MODEL: + RESNETS: + # corresponding to C2,C3,C4,C5 + STAGE_WITH_DCN: (False, True, True, True) + WITH_MODULATED_DCN: True + DEFORMABLE_GROUPS: 1 +``` \ No newline at end of file diff --git a/configs/dcn/e2e_faster_rcnn_dconv_R_50_FPN_1x.yaml b/configs/dcn/e2e_faster_rcnn_dconv_R_50_FPN_1x.yaml new file mode 100644 index 000000000..25c037592 --- /dev/null +++ b/configs/dcn/e2e_faster_rcnn_dconv_R_50_FPN_1x.yaml @@ -0,0 +1,44 @@ +INPUT: + MIN_SIZE_TRAIN: (800,) + MAX_SIZE_TRAIN: 1333 + MIN_SIZE_TEST: 800 + MAX_SIZE_TEST: 1333 +MODEL: + META_ARCHITECTURE: "GeneralizedRCNN" + WEIGHT: "catalog://ImageNetPretrained/MSRA/R-50" + BACKBONE: + CONV_BODY: "R-50-FPN" + RESNETS: + BACKBONE_OUT_CHANNELS: 256 + STAGE_WITH_DCN: (False, True, True, True) + WITH_MODULATED_DCN: False + DEFORMABLE_GROUPS: 1 + RPN: + USE_FPN: True + ANCHOR_STRIDE: (4, 8, 16, 32, 64) + PRE_NMS_TOP_N_TRAIN: 2000 + PRE_NMS_TOP_N_TEST: 1000 + POST_NMS_TOP_N_TEST: 1000 + FPN_POST_NMS_TOP_N_TEST: 1000 + ROI_HEADS: + USE_FPN: True + ROI_BOX_HEAD: + POOLER_RESOLUTION: 7 + POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) + POOLER_SAMPLING_RATIO: 2 + FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor" + PREDICTOR: "FPNPredictor" +DATASETS: + TRAIN: ("coco_2014_train", "coco_2014_valminusminival") + TEST: ("coco_2014_minival",) +DATALOADER: + SIZE_DIVISIBILITY: 32 +SOLVER: + # Assume 8 gpus + BASE_LR: 0.02 + WEIGHT_DECAY: 0.0001 + STEPS: (60000, 80000) + MAX_ITER: 90000 + IMS_PER_BATCH: 16 +TEST: + IMS_PER_BATCH: 8 diff --git a/configs/dcn/e2e_faster_rcnn_mdconv_R_50_FPN_1x.yaml b/configs/dcn/e2e_faster_rcnn_mdconv_R_50_FPN_1x.yaml new file mode 100644 index 000000000..6bc04212c --- /dev/null +++ b/configs/dcn/e2e_faster_rcnn_mdconv_R_50_FPN_1x.yaml @@ -0,0 +1,44 @@ +INPUT: + MIN_SIZE_TRAIN: (800,) + MAX_SIZE_TRAIN: 1333 + MIN_SIZE_TEST: 800 + MAX_SIZE_TEST: 1333 +MODEL: + META_ARCHITECTURE: "GeneralizedRCNN" + WEIGHT: "catalog://ImageNetPretrained/MSRA/R-50" + BACKBONE: + CONV_BODY: "R-50-FPN" + RESNETS: + BACKBONE_OUT_CHANNELS: 256 + STAGE_WITH_DCN: (False, True, True, True) + WITH_MODULATED_DCN: True + DEFORMABLE_GROUPS: 1 + RPN: + USE_FPN: True + ANCHOR_STRIDE: (4, 8, 16, 32, 64) + PRE_NMS_TOP_N_TRAIN: 2000 + PRE_NMS_TOP_N_TEST: 1000 + POST_NMS_TOP_N_TEST: 1000 + FPN_POST_NMS_TOP_N_TEST: 1000 + ROI_HEADS: + USE_FPN: True + ROI_BOX_HEAD: + POOLER_RESOLUTION: 7 + POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) + POOLER_SAMPLING_RATIO: 2 + FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor" + PREDICTOR: "FPNPredictor" +DATASETS: + TRAIN: ("coco_2014_train", "coco_2014_valminusminival") + TEST: ("coco_2014_minival",) +DATALOADER: + SIZE_DIVISIBILITY: 32 +SOLVER: + # Assume 8 gpus + BASE_LR: 0.02 + WEIGHT_DECAY: 0.0001 + STEPS: (60000, 80000) + MAX_ITER: 90000 + IMS_PER_BATCH: 16 +TEST: + IMS_PER_BATCH: 8 diff --git a/configs/dcn/e2e_mask_rcnn_dconv_R_50_FPN_1x.yaml b/configs/dcn/e2e_mask_rcnn_dconv_R_50_FPN_1x.yaml new file mode 100644 index 000000000..5cffe5c57 --- /dev/null +++ b/configs/dcn/e2e_mask_rcnn_dconv_R_50_FPN_1x.yaml @@ -0,0 +1,54 @@ +INPUT: + MIN_SIZE_TRAIN: (800,) + MAX_SIZE_TRAIN: 1333 + MIN_SIZE_TEST: 800 + MAX_SIZE_TEST: 1333 +MODEL: + META_ARCHITECTURE: "GeneralizedRCNN" + WEIGHT: "catalog://ImageNetPretrained/MSRA/R-50" + BACKBONE: + CONV_BODY: "R-50-FPN" + RESNETS: + BACKBONE_OUT_CHANNELS: 256 + STAGE_WITH_DCN: (False, True, True, True) + WITH_MODULATED_DCN: False + DEFORMABLE_GROUPS: 1 + RPN: + USE_FPN: True + ANCHOR_STRIDE: (4, 8, 16, 32, 64) + PRE_NMS_TOP_N_TRAIN: 2000 + 
PRE_NMS_TOP_N_TEST: 1000 + POST_NMS_TOP_N_TEST: 1000 + FPN_POST_NMS_TOP_N_TEST: 1000 + ROI_HEADS: + USE_FPN: True + ROI_BOX_HEAD: + POOLER_RESOLUTION: 7 + POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) + POOLER_SAMPLING_RATIO: 2 + FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor" + PREDICTOR: "FPNPredictor" + ROI_MASK_HEAD: + POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) + FEATURE_EXTRACTOR: "MaskRCNNFPNFeatureExtractor" + PREDICTOR: "MaskRCNNC4Predictor" + POOLER_RESOLUTION: 14 + POOLER_SAMPLING_RATIO: 2 + RESOLUTION: 28 + SHARE_BOX_FEATURE_EXTRACTOR: False + MASK_ON: True +DATASETS: + TRAIN: ("coco_2014_train", "coco_2014_valminusminival") + TEST: ("coco_2014_minival",) +DATALOADER: + SIZE_DIVISIBILITY: 32 +SOLVER: + # Assume 8 gpus + BASE_LR: 0.02 + WEIGHT_DECAY: 0.0001 + STEPS: (60000, 80000) + MAX_ITER: 90000 + IMS_PER_BATCH: 16 +TEST: + IMS_PER_BATCH: 8 + diff --git a/configs/dcn/e2e_mask_rcnn_mdconv_R_50_FPN_1x.yaml b/configs/dcn/e2e_mask_rcnn_mdconv_R_50_FPN_1x.yaml new file mode 100644 index 000000000..9921adfa5 --- /dev/null +++ b/configs/dcn/e2e_mask_rcnn_mdconv_R_50_FPN_1x.yaml @@ -0,0 +1,53 @@ +INPUT: + MIN_SIZE_TRAIN: (800,) + MAX_SIZE_TRAIN: 1333 + MIN_SIZE_TEST: 800 + MAX_SIZE_TEST: 1333 +MODEL: + META_ARCHITECTURE: "GeneralizedRCNN" + WEIGHT: "catalog://ImageNetPretrained/MSRA/R-50" + BACKBONE: + CONV_BODY: "R-50-FPN" + RESNETS: + BACKBONE_OUT_CHANNELS: 256 + STAGE_WITH_DCN: (False, True, True, True) + WITH_MODULATED_DCN: True + DEFORMABLE_GROUPS: 1 + RPN: + USE_FPN: True + ANCHOR_STRIDE: (4, 8, 16, 32, 64) + PRE_NMS_TOP_N_TRAIN: 2000 + PRE_NMS_TOP_N_TEST: 1000 + POST_NMS_TOP_N_TEST: 1000 + FPN_POST_NMS_TOP_N_TEST: 1000 + ROI_HEADS: + USE_FPN: True + ROI_BOX_HEAD: + POOLER_RESOLUTION: 7 + POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) + POOLER_SAMPLING_RATIO: 2 + FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor" + PREDICTOR: "FPNPredictor" + ROI_MASK_HEAD: + POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) + FEATURE_EXTRACTOR: "MaskRCNNFPNFeatureExtractor" + PREDICTOR: "MaskRCNNC4Predictor" + POOLER_RESOLUTION: 14 + POOLER_SAMPLING_RATIO: 2 + RESOLUTION: 28 + SHARE_BOX_FEATURE_EXTRACTOR: False + MASK_ON: True +DATASETS: + TRAIN: ("coco_2014_train", "coco_2014_valminusminival") + TEST: ("coco_2014_minival",) +DATALOADER: + SIZE_DIVISIBILITY: 32 +SOLVER: + # Assume 8 gpus + BASE_LR: 0.02 + WEIGHT_DECAY: 0.0001 + STEPS: (60000, 80000) + MAX_ITER: 90000 + IMS_PER_BATCH: 16 +TEST: + IMS_PER_BATCH: 8 diff --git a/configs/test_time_aug/e2e_mask_rcnn_R_50_FPN_1x.yaml b/configs/test_time_aug/e2e_mask_rcnn_R_50_FPN_1x.yaml new file mode 100644 index 000000000..d1e4a75b6 --- /dev/null +++ b/configs/test_time_aug/e2e_mask_rcnn_R_50_FPN_1x.yaml @@ -0,0 +1,48 @@ +MODEL: + META_ARCHITECTURE: "GeneralizedRCNN" + WEIGHT: "catalog://ImageNetPretrained/MSRA/R-50" + BACKBONE: + CONV_BODY: "R-50-FPN" + RESNETS: + BACKBONE_OUT_CHANNELS: 256 + RPN: + USE_FPN: True + ANCHOR_STRIDE: (4, 8, 16, 32, 64) + PRE_NMS_TOP_N_TRAIN: 2000 + PRE_NMS_TOP_N_TEST: 1000 + POST_NMS_TOP_N_TEST: 1000 + FPN_POST_NMS_TOP_N_TEST: 1000 + ROI_HEADS: + USE_FPN: True + ROI_BOX_HEAD: + POOLER_RESOLUTION: 7 + POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) + POOLER_SAMPLING_RATIO: 2 + FEATURE_EXTRACTOR: "FPN2MLPFeatureExtractor" + PREDICTOR: "FPNPredictor" + ROI_MASK_HEAD: + POOLER_SCALES: (0.25, 0.125, 0.0625, 0.03125) + FEATURE_EXTRACTOR: "MaskRCNNFPNFeatureExtractor" + PREDICTOR: "MaskRCNNC4Predictor" + POOLER_RESOLUTION: 14 + POOLER_SAMPLING_RATIO: 2 + RESOLUTION: 28 + SHARE_BOX_FEATURE_EXTRACTOR: False + MASK_ON: True +DATASETS: + 
TRAIN: ("coco_2014_train", "coco_2014_valminusminival") + TEST: ("coco_2014_minival",) +DATALOADER: + SIZE_DIVISIBILITY: 32 +SOLVER: + BASE_LR: 0.02 + WEIGHT_DECAY: 0.0001 + STEPS: (60000, 80000) + MAX_ITER: 90000 +TEST: + BBOX_AUG: + ENABLED: True + H_FLIP: True + SCALES: (400, 500, 600, 700, 900, 1000, 1100, 1200) + MAX_SIZE: 2000 + SCALE_H_FLIP: True diff --git a/demo/README.md b/demo/README.md index 393a064b0..5926f8d35 100644 --- a/demo/README.md +++ b/demo/README.md @@ -38,7 +38,8 @@ docker run --rm -it \ -v /tmp/.X11-unix:/tmp/.X11-unix \ --device=/dev/video0:/dev/video0 \ --ipc=host maskrcnn-benchmark \ - python demo/webcam.py --min-image-size 300 + python demo/webcam.py --min-image-size 300 \ + --config-file configs/caffe2/e2e_mask_rcnn_R_50_FPN_1x_caffe2.yaml ``` **DISCLAIMER:** *This was tested for an Ubuntu 16.04 machine, diff --git a/demo/predictor.py b/demo/predictor.py index b152fda8d..fa663c7e4 100644 --- a/demo/predictor.py +++ b/demo/predictor.py @@ -2,7 +2,7 @@ import cv2 import torch from torchvision import transforms as T - +from torchvision.transforms import functional as F from maskrcnn_benchmark.modeling.detector import build_detection_model from maskrcnn_benchmark.utils.checkpoint import DetectronCheckpointer from maskrcnn_benchmark.structures.image_list import to_image_list @@ -10,7 +10,38 @@ from maskrcnn_benchmark import layers as L from maskrcnn_benchmark.utils import cv2_util +class Resize(object): + def __init__(self, min_size, max_size): + self.min_size = min_size + self.max_size = max_size + + # modified from torchvision to add support for max size + def get_size(self, image_size): + w, h = image_size + size = self.min_size + max_size = self.max_size + if max_size is not None: + min_original_size = float(min((w, h))) + max_original_size = float(max((w, h))) + if max_original_size / min_original_size * size > max_size: + size = int(round(max_size * min_original_size / max_original_size)) + + if (w <= h and w == size) or (h <= w and h == size): + return (h, w) + + if w < h: + ow = size + oh = int(size * h / w) + else: + oh = size + ow = int(size * w / h) + + return (oh, ow) + def __call__(self, image): + size = self.get_size(image.size) + image = F.resize(image, size) + return image class COCODemo(object): # COCO categories for pretty print CATEGORIES = [ @@ -147,11 +178,12 @@ def build_transform(self): normalize_transform = T.Normalize( mean=cfg.INPUT.PIXEL_MEAN, std=cfg.INPUT.PIXEL_STD ) - + min_size = cfg.INPUT.MIN_SIZE_TEST + max_size = cfg.INPUT.MAX_SIZE_TEST transform = T.Compose( [ T.ToPILImage(), - T.Resize(self.min_image_size), + Resize(min_size, max_size), T.ToTensor(), to_bgr_transform, normalize_transform, diff --git a/docker/Dockerfile b/docker/Dockerfile index 39b508258..762441fe2 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -29,10 +29,11 @@ ENV PATH=$CONDA_PREFIX/bin:$PATH ENV CONDA_AUTO_UPDATE_CONDA=false RUN conda install -y ipython -RUN pip install ninja yacs cython matplotlib opencv-python +RUN pip install requests ninja yacs cython matplotlib opencv-python tqdm -# Install PyTorch 1.0 Nightly and OpenCV -RUN conda install -y pytorch-nightly -c pytorch \ +# Install PyTorch 1.0 Nightly +ARG CUDA +RUN conda install pytorch-nightly cudatoolkit=${CUDA} -c pytorch \ && conda clean -ya # Install TorchVision master @@ -45,7 +46,14 @@ RUN git clone https://github.com/cocodataset/cocoapi.git \ && cd cocoapi/PythonAPI \ && python setup.py build_ext install +# install apex +RUN git clone https://github.com/NVIDIA/apex.git \ + && cd apex \ 
+ && python setup.py install --cuda_ext --cpp_ext + # install PyTorch Detection +ARG FORCE_CUDA="1" +ENV FORCE_CUDA=${FORCE_CUDA} RUN git clone https://github.com/facebookresearch/maskrcnn-benchmark.git \ && cd maskrcnn-benchmark \ && python setup.py build develop diff --git a/docker/docker-jupyter/Dockerfile b/docker/docker-jupyter/Dockerfile index 323727195..bf957b636 100644 --- a/docker/docker-jupyter/Dockerfile +++ b/docker/docker-jupyter/Dockerfile @@ -28,7 +28,7 @@ ENV PATH=$CONDA_PREFIX/bin:$PATH ENV CONDA_AUTO_UPDATE_CONDA=false RUN conda install -y ipython -RUN pip install ninja yacs cython matplotlib jupyter +RUN pip install requests ninja yacs cython matplotlib jupyter # Install PyTorch 1.0 Nightly and OpenCV RUN conda install -y pytorch-nightly -c pytorch \ diff --git a/maskrcnn_benchmark/config/defaults.py b/maskrcnn_benchmark/config/defaults.py index fc750fd4f..beae4070a 100644 --- a/maskrcnn_benchmark/config/defaults.py +++ b/maskrcnn_benchmark/config/defaults.py @@ -10,9 +10,9 @@ # Whenever an argument can be either used for training or for testing, the # corresponding name will be post-fixed by a _TRAIN for a training parameter, # or _TEST for a test-specific parameter. -# For example, the number of images during training will be -# IMAGES_PER_BATCH_TRAIN, while the number of images for testing will be -# IMAGES_PER_BATCH_TEST +# For example, the maximum image side during training will be +# INPUT.MAX_SIZE_TRAIN, while for testing it will be +# INPUT.MAX_SIZE_TEST # ----------------------------------------------------------------------------- # Config definition @@ -54,6 +54,12 @@ # Convert image to BGR format (for Caffe2 models), in range 0-255 _C.INPUT.TO_BGR255 = True +# Image ColorJitter +_C.INPUT.BRIGHTNESS = 0.0 +_C.INPUT.CONTRAST = 0.0 +_C.INPUT.SATURATION = 0.0 +_C.INPUT.HUE = 0.0 + # ----------------------------------------------------------------------------- # Dataset @@ -92,8 +98,6 @@ # Add StopGrad at a specified stage so the bottom layers are frozen _C.MODEL.BACKBONE.FREEZE_CONV_BODY_AT = 2 -# GN for backbone -_C.MODEL.BACKBONE.USE_GN = False # ---------------------------------------------------------------------------- # @@ -159,6 +163,9 @@ # all FPN levels _C.MODEL.RPN.FPN_POST_NMS_TOP_N_TRAIN = 2000 _C.MODEL.RPN.FPN_POST_NMS_TOP_N_TEST = 2000 +# Apply the post NMS per batch (default) or per image during training +# (default is True to be consistent with Detectron, see Issue #672) +_C.MODEL.RPN.FPN_POST_NMS_PER_BATCH = True # Custom rpn head, empty to use default conv or separable conv _C.MODEL.RPN.RPN_HEAD = "SingleConvRPNHead" @@ -274,6 +281,10 @@ _C.MODEL.RESNETS.RES2_OUT_CHANNELS = 256 _C.MODEL.RESNETS.STEM_OUT_CHANNELS = 64 +_C.MODEL.RESNETS.STAGE_WITH_DCN = (False, False, False, False) +_C.MODEL.RESNETS.WITH_MODULATED_DCN = False +_C.MODEL.RESNETS.DEFORMABLE_GROUPS = 1 + # ---------------------------------------------------------------------------- # # RetinaNet Options (Follow the Detectron version) @@ -414,6 +425,27 @@ # Number of detections per image _C.TEST.DETECTIONS_PER_IMG = 100 +# ---------------------------------------------------------------------------- # +# Test-time augmentations for bounding box detection +# See configs/test_time_aug/e2e_mask_rcnn_R-50-FPN_1x.yaml for an example +# ---------------------------------------------------------------------------- # +_C.TEST.BBOX_AUG = CN() + +# Enable test-time augmentation for bounding box detection if True +_C.TEST.BBOX_AUG.ENABLED = False + +# Horizontal flip at the original scale (id 
transform) +_C.TEST.BBOX_AUG.H_FLIP = False + +# Each scale is the pixel size of an image's shortest side +_C.TEST.BBOX_AUG.SCALES = () + +# Max pixel size of the longer side +_C.TEST.BBOX_AUG.MAX_SIZE = 4000 + +# Horizontal flip at each scale +_C.TEST.BBOX_AUG.SCALE_H_FLIP = False + # ---------------------------------------------------------------------------- # # Misc options @@ -421,3 +453,13 @@ _C.OUTPUT_DIR = "." _C.PATHS_CATALOG = os.path.join(os.path.dirname(__file__), "paths_catalog.py") + +# ---------------------------------------------------------------------------- # +# Precision options +# ---------------------------------------------------------------------------- # + +# Precision of input, allowable: (float32, float16) +_C.DTYPE = "float32" + +# Enable verbosity in apex.amp +_C.AMP_VERBOSE = False diff --git a/maskrcnn_benchmark/csrc/cpu/ROIAlign_cpu.cpp b/maskrcnn_benchmark/csrc/cpu/ROIAlign_cpu.cpp index cd9fde2ae..d35aedf27 100644 --- a/maskrcnn_benchmark/csrc/cpu/ROIAlign_cpu.cpp +++ b/maskrcnn_benchmark/csrc/cpu/ROIAlign_cpu.cpp @@ -239,7 +239,7 @@ at::Tensor ROIAlign_forward_cpu(const at::Tensor& input, return output; } - AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "ROIAlign_forward", [&] { + AT_DISPATCH_FLOATING_TYPES(input.type(), "ROIAlign_forward", [&] { ROIAlignForward_cpu_kernel( output_size, input.data(), diff --git a/maskrcnn_benchmark/csrc/cpu/nms_cpu.cpp b/maskrcnn_benchmark/csrc/cpu/nms_cpu.cpp index 639ca472e..1153dea04 100644 --- a/maskrcnn_benchmark/csrc/cpu/nms_cpu.cpp +++ b/maskrcnn_benchmark/csrc/cpu/nms_cpu.cpp @@ -68,7 +68,7 @@ at::Tensor nms_cpu(const at::Tensor& dets, const at::Tensor& scores, const float threshold) { at::Tensor result; - AT_DISPATCH_FLOATING_TYPES(dets.scalar_type(), "nms", [&] { + AT_DISPATCH_FLOATING_TYPES(dets.type(), "nms", [&] { result = nms_cpu_kernel(dets, scores, threshold); }); return result; diff --git a/maskrcnn_benchmark/csrc/cuda/ROIAlign_cuda.cu b/maskrcnn_benchmark/csrc/cuda/ROIAlign_cuda.cu index 170771aa8..1142fb375 100644 --- a/maskrcnn_benchmark/csrc/cuda/ROIAlign_cuda.cu +++ b/maskrcnn_benchmark/csrc/cuda/ROIAlign_cuda.cu @@ -280,7 +280,7 @@ at::Tensor ROIAlign_forward_cuda(const at::Tensor& input, return output; } - AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "ROIAlign_forward", [&] { + AT_DISPATCH_FLOATING_TYPES(input.type(), "ROIAlign_forward", [&] { RoIAlignForward<<>>( output_size, input.contiguous().data(), @@ -326,7 +326,7 @@ at::Tensor ROIAlign_backward_cuda(const at::Tensor& grad, return grad_input; } - AT_DISPATCH_FLOATING_TYPES(grad.scalar_type(), "ROIAlign_backward", [&] { + AT_DISPATCH_FLOATING_TYPES(grad.type(), "ROIAlign_backward", [&] { RoIAlignBackwardFeature<<>>( grad.numel(), grad.contiguous().data(), diff --git a/maskrcnn_benchmark/csrc/cuda/ROIPool_cuda.cu b/maskrcnn_benchmark/csrc/cuda/ROIPool_cuda.cu index cef3beaa4..8f072ffc2 100644 --- a/maskrcnn_benchmark/csrc/cuda/ROIPool_cuda.cu +++ b/maskrcnn_benchmark/csrc/cuda/ROIPool_cuda.cu @@ -134,7 +134,7 @@ std::tuple ROIPool_forward_cuda(const at::Tensor& input, return std::make_tuple(output, argmax); } - AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "ROIPool_forward", [&] { + AT_DISPATCH_FLOATING_TYPES(input.type(), "ROIPool_forward", [&] { RoIPoolFForward<<>>( output_size, input.contiguous().data(), @@ -182,7 +182,7 @@ at::Tensor ROIPool_backward_cuda(const at::Tensor& grad, return grad_input; } - AT_DISPATCH_FLOATING_TYPES(grad.scalar_type(), "ROIPool_backward", [&] { + AT_DISPATCH_FLOATING_TYPES(grad.type(), "ROIPool_backward", 
[&] { RoIPoolFBackward<<>>( grad.numel(), grad.contiguous().data(), diff --git a/maskrcnn_benchmark/csrc/cuda/SigmoidFocalLoss_cuda.cu b/maskrcnn_benchmark/csrc/cuda/SigmoidFocalLoss_cuda.cu index cd9b4c96b..456a5f235 100644 --- a/maskrcnn_benchmark/csrc/cuda/SigmoidFocalLoss_cuda.cu +++ b/maskrcnn_benchmark/csrc/cuda/SigmoidFocalLoss_cuda.cu @@ -117,7 +117,8 @@ at::Tensor SigmoidFocalLoss_forward_cuda( auto losses_size = num_samples * logits.size(1); cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - dim3 grid(std::min(THCCeilDiv(losses_size, 512L), 4096L)); + dim3 grid(std::min(THCCeilDiv((long)losses_size, 512L), 4096L)); + dim3 block(512); if (losses.numel() == 0) { @@ -125,7 +126,7 @@ at::Tensor SigmoidFocalLoss_forward_cuda( return losses; } - AT_DISPATCH_FLOATING_TYPES(logits.scalar_type(), "SigmoidFocalLoss_forward", [&] { + AT_DISPATCH_FLOATING_TYPES(logits.type(), "SigmoidFocalLoss_forward", [&] { SigmoidFocalLossForward<<>>( losses_size, logits.contiguous().data(), @@ -161,7 +162,7 @@ at::Tensor SigmoidFocalLoss_backward_cuda( auto d_logits_size = num_samples * logits.size(1); cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - dim3 grid(std::min(THCCeilDiv(d_logits_size, 512L), 4096L)); + dim3 grid(std::min(THCCeilDiv((long)d_logits_size, 512L), 4096L)); dim3 block(512); if (d_logits.numel() == 0) { @@ -169,7 +170,7 @@ at::Tensor SigmoidFocalLoss_backward_cuda( return d_logits; } - AT_DISPATCH_FLOATING_TYPES(logits.scalar_type(), "SigmoidFocalLoss_backward", [&] { + AT_DISPATCH_FLOATING_TYPES(logits.type(), "SigmoidFocalLoss_backward", [&] { SigmoidFocalLossBackward<<>>( d_logits_size, logits.contiguous().data(), diff --git a/maskrcnn_benchmark/csrc/cuda/deform_conv_cuda.cu b/maskrcnn_benchmark/csrc/cuda/deform_conv_cuda.cu new file mode 100644 index 000000000..74f7d3399 --- /dev/null +++ b/maskrcnn_benchmark/csrc/cuda/deform_conv_cuda.cu @@ -0,0 +1,691 @@ +// modify from +// https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/blob/mmdetection/mmdet/ops/dcn/src/deform_conv_cuda.c + +#include +#include + +#include +#include + +#include +#include +#include + + +void deformable_im2col(const at::Tensor data_im, const at::Tensor data_offset, + const int channels, const int height, const int width, + const int ksize_h, const int ksize_w, const int pad_h, + const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int parallel_imgs, const int deformable_group, + at::Tensor data_col); + +void deformable_col2im(const at::Tensor data_col, const at::Tensor data_offset, + const int channels, const int height, const int width, + const int ksize_h, const int ksize_w, const int pad_h, + const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int parallel_imgs, const int deformable_group, + at::Tensor grad_im); + +void deformable_col2im_coord( + const at::Tensor data_col, const at::Tensor data_im, + const at::Tensor data_offset, const int channels, const int height, + const int width, const int ksize_h, const int ksize_w, const int pad_h, + const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, const int parallel_imgs, + const int deformable_group, at::Tensor grad_offset); + +void modulated_deformable_im2col_cuda( + const at::Tensor data_im, const at::Tensor data_offset, + const at::Tensor data_mask, const int batch_size, const int channels, + const int height_im, const int width_im, const int height_col, + const 
int width_col, const int kernel_h, const int kenerl_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, const int deformable_group, + at::Tensor data_col); + +void modulated_deformable_col2im_cuda( + const at::Tensor data_col, const at::Tensor data_offset, + const at::Tensor data_mask, const int batch_size, const int channels, + const int height_im, const int width_im, const int height_col, + const int width_col, const int kernel_h, const int kenerl_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, const int deformable_group, + at::Tensor grad_im); + +void modulated_deformable_col2im_coord_cuda( + const at::Tensor data_col, const at::Tensor data_im, + const at::Tensor data_offset, const at::Tensor data_mask, + const int batch_size, const int channels, const int height_im, + const int width_im, const int height_col, const int width_col, + const int kernel_h, const int kenerl_w, const int pad_h, const int pad_w, + const int stride_h, const int stride_w, const int dilation_h, + const int dilation_w, const int deformable_group, at::Tensor grad_offset, + at::Tensor grad_mask); + +void shape_check(at::Tensor input, at::Tensor offset, at::Tensor *gradOutput, + at::Tensor weight, int kH, int kW, int dH, int dW, int padH, + int padW, int dilationH, int dilationW, int group, + int deformable_group) +{ + AT_CHECK(weight.ndimension() == 4, + "4D weight tensor (nOutputPlane,nInputPlane,kH,kW) expected, " + "but got: %s", + weight.ndimension()); + + AT_CHECK(weight.is_contiguous(), "weight tensor has to be contiguous"); + + AT_CHECK(kW > 0 && kH > 0, + "kernel size should be greater than zero, but got kH: %d kW: %d", kH, + kW); + + AT_CHECK((weight.size(2) == kH && weight.size(3) == kW), + "kernel size should be consistent with weight, ", + "but got kH: %d kW: %d weight.size(2): %d, weight.size(3): %d", kH, + kW, weight.size(2), weight.size(3)); + + AT_CHECK(dW > 0 && dH > 0, + "stride should be greater than zero, but got dH: %d dW: %d", dH, dW); + + AT_CHECK( + dilationW > 0 && dilationH > 0, + "dilation should be greater than 0, but got dilationH: %d dilationW: %d", + dilationH, dilationW); + + int ndim = input.ndimension(); + int dimf = 0; + int dimh = 1; + int dimw = 2; + + if (ndim == 4) { + dimf++; + dimh++; + dimw++; + } + + AT_CHECK(ndim == 3 || ndim == 4, "3D or 4D input tensor expected but got: %s", + ndim); + + long nInputPlane = weight.size(1) * group; + long inputHeight = input.size(dimh); + long inputWidth = input.size(dimw); + long nOutputPlane = weight.size(0); + long outputHeight = + (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1; + long outputWidth = + (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1; + + AT_CHECK(nInputPlane % deformable_group == 0, + "input channels must divide deformable group size"); + + if (outputWidth < 1 || outputHeight < 1) + AT_ERROR( + "Given input size: (%ld x %ld x %ld). " + "Calculated output size: (%ld x %ld x %ld). 
Output size is too small", + nInputPlane, inputHeight, inputWidth, nOutputPlane, outputHeight, + outputWidth); + + AT_CHECK(input.size(1) == nInputPlane, + "invalid number of input planes, expected: %d, but got: %d", + nInputPlane, input.size(1)); + + AT_CHECK((inputHeight >= kH && inputWidth >= kW), + "input image is smaller than kernel"); + + AT_CHECK((offset.size(2) == outputHeight && offset.size(3) == outputWidth), + "invalid spatial size of offset, expected height: %d width: %d, but " + "got height: %d width: %d", + outputHeight, outputWidth, offset.size(2), offset.size(3)); + + AT_CHECK((offset.size(1) == deformable_group * 2 * kH * kW), + "invalid number of channels of offset"); + + if (gradOutput != NULL) { + AT_CHECK(gradOutput->size(dimf) == nOutputPlane, + "invalid number of gradOutput planes, expected: %d, but got: %d", + nOutputPlane, gradOutput->size(dimf)); + + AT_CHECK((gradOutput->size(dimh) == outputHeight && + gradOutput->size(dimw) == outputWidth), + "invalid size of gradOutput, expected height: %d width: %d , but " + "got height: %d width: %d", + outputHeight, outputWidth, gradOutput->size(dimh), + gradOutput->size(dimw)); + } +} + +int deform_conv_forward_cuda(at::Tensor input, at::Tensor weight, + at::Tensor offset, at::Tensor output, + at::Tensor columns, at::Tensor ones, int kW, + int kH, int dW, int dH, int padW, int padH, + int dilationW, int dilationH, int group, + int deformable_group, int im2col_step) +{ + // todo: resize columns to include im2col: done + // todo: add im2col_step as input + // todo: add new output buffer and transpose it to output (or directly + // transpose output) todo: possibly change data indexing because of + // parallel_imgs + + shape_check(input, offset, NULL, weight, kH, kW, dH, dW, padH, padW, + dilationH, dilationW, group, deformable_group); + + input = input.contiguous(); + offset = offset.contiguous(); + weight = weight.contiguous(); + + int batch = 1; + if (input.ndimension() == 3) { + // Force batch + batch = 0; + input.unsqueeze_(0); + offset.unsqueeze_(0); + } + + // todo: assert batchsize dividable by im2col_step + + long batchSize = input.size(0); + long nInputPlane = input.size(1); + long inputHeight = input.size(2); + long inputWidth = input.size(3); + + long nOutputPlane = weight.size(0); + + long outputWidth = + (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1; + long outputHeight = + (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1; + + AT_CHECK((offset.size(0) == batchSize), "invalid batch size of offset"); + + output = output.view({batchSize / im2col_step, im2col_step, nOutputPlane, + outputHeight, outputWidth}); + columns = at::zeros( + {nInputPlane * kW * kH, im2col_step * outputHeight * outputWidth}, + input.options()); + + if (ones.ndimension() != 2 || + ones.size(0) * ones.size(1) < outputHeight * outputWidth) { + ones = at::ones({outputHeight, outputWidth}, input.options()); + } + + input = input.view({batchSize / im2col_step, im2col_step, nInputPlane, + inputHeight, inputWidth}); + offset = + offset.view({batchSize / im2col_step, im2col_step, + deformable_group * 2 * kH * kW, outputHeight, outputWidth}); + + at::Tensor output_buffer = + at::zeros({batchSize / im2col_step, nOutputPlane, + im2col_step * outputHeight, outputWidth}, + output.options()); + + output_buffer = output_buffer.view( + {output_buffer.size(0), group, output_buffer.size(1) / group, + output_buffer.size(2), output_buffer.size(3)}); + + for (int elt = 0; elt < batchSize / im2col_step; elt++) { + 
deformable_im2col(input[elt], offset[elt], nInputPlane, inputHeight, + inputWidth, kH, kW, padH, padW, dH, dW, dilationH, + dilationW, im2col_step, deformable_group, columns); + + columns = columns.view({group, columns.size(0) / group, columns.size(1)}); + weight = weight.view({group, weight.size(0) / group, weight.size(1), + weight.size(2), weight.size(3)}); + + for (int g = 0; g < group; g++) { + output_buffer[elt][g] = output_buffer[elt][g] + .flatten(1) + .addmm_(weight[g].flatten(1), columns[g]) + .view_as(output_buffer[elt][g]); + } + } + + output_buffer = output_buffer.view( + {output_buffer.size(0), output_buffer.size(1) * output_buffer.size(2), + output_buffer.size(3), output_buffer.size(4)}); + + output_buffer = output_buffer.view({batchSize / im2col_step, nOutputPlane, + im2col_step, outputHeight, outputWidth}); + output_buffer.transpose_(1, 2); + output.copy_(output_buffer); + output = output.view({batchSize, nOutputPlane, outputHeight, outputWidth}); + + input = input.view({batchSize, nInputPlane, inputHeight, inputWidth}); + offset = offset.view( + {batchSize, deformable_group * 2 * kH * kW, outputHeight, outputWidth}); + + if (batch == 0) { + output = output.view({nOutputPlane, outputHeight, outputWidth}); + input = input.view({nInputPlane, inputHeight, inputWidth}); + offset = offset.view({offset.size(1), offset.size(2), offset.size(3)}); + } + + return 1; +} + +int deform_conv_backward_input_cuda(at::Tensor input, at::Tensor offset, + at::Tensor gradOutput, at::Tensor gradInput, + at::Tensor gradOffset, at::Tensor weight, + at::Tensor columns, int kW, int kH, int dW, + int dH, int padW, int padH, int dilationW, + int dilationH, int group, + int deformable_group, int im2col_step) +{ + shape_check(input, offset, &gradOutput, weight, kH, kW, dH, dW, padH, padW, + dilationH, dilationW, group, deformable_group); + + input = input.contiguous(); + offset = offset.contiguous(); + gradOutput = gradOutput.contiguous(); + weight = weight.contiguous(); + + int batch = 1; + + if (input.ndimension() == 3) { + // Force batch + batch = 0; + input = input.view({1, input.size(0), input.size(1), input.size(2)}); + offset = offset.view({1, offset.size(0), offset.size(1), offset.size(2)}); + gradOutput = gradOutput.view( + {1, gradOutput.size(0), gradOutput.size(1), gradOutput.size(2)}); + } + + long batchSize = input.size(0); + long nInputPlane = input.size(1); + long inputHeight = input.size(2); + long inputWidth = input.size(3); + + long nOutputPlane = weight.size(0); + + long outputWidth = + (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1; + long outputHeight = + (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1; + + AT_CHECK((offset.size(0) == batchSize), 3, "invalid batch size of offset"); + gradInput = gradInput.view({batchSize, nInputPlane, inputHeight, inputWidth}); + columns = at::zeros( + {nInputPlane * kW * kH, im2col_step * outputHeight * outputWidth}, + input.options()); + + // change order of grad output + gradOutput = gradOutput.view({batchSize / im2col_step, im2col_step, + nOutputPlane, outputHeight, outputWidth}); + gradOutput.transpose_(1, 2); + + gradInput = gradInput.view({batchSize / im2col_step, im2col_step, nInputPlane, + inputHeight, inputWidth}); + input = input.view({batchSize / im2col_step, im2col_step, nInputPlane, + inputHeight, inputWidth}); + gradOffset = gradOffset.view({batchSize / im2col_step, im2col_step, + deformable_group * 2 * kH * kW, outputHeight, + outputWidth}); + offset = + offset.view({batchSize / im2col_step, im2col_step, 
+ deformable_group * 2 * kH * kW, outputHeight, outputWidth}); + + for (int elt = 0; elt < batchSize / im2col_step; elt++) { + // divide into groups + columns = columns.view({group, columns.size(0) / group, columns.size(1)}); + weight = weight.view({group, weight.size(0) / group, weight.size(1), + weight.size(2), weight.size(3)}); + gradOutput = gradOutput.view( + {gradOutput.size(0), group, gradOutput.size(1) / group, + gradOutput.size(2), gradOutput.size(3), gradOutput.size(4)}); + + for (int g = 0; g < group; g++) { + columns[g] = columns[g].addmm_(weight[g].flatten(1).transpose(0, 1), + gradOutput[elt][g].flatten(1), 0.0f, 1.0f); + } + + columns = + columns.view({columns.size(0) * columns.size(1), columns.size(2)}); + gradOutput = gradOutput.view( + {gradOutput.size(0), gradOutput.size(1) * gradOutput.size(2), + gradOutput.size(3), gradOutput.size(4), gradOutput.size(5)}); + + deformable_col2im_coord(columns, input[elt], offset[elt], nInputPlane, + inputHeight, inputWidth, kH, kW, padH, padW, dH, dW, + dilationH, dilationW, im2col_step, deformable_group, + gradOffset[elt]); + + deformable_col2im(columns, offset[elt], nInputPlane, inputHeight, + inputWidth, kH, kW, padH, padW, dH, dW, dilationH, + dilationW, im2col_step, deformable_group, gradInput[elt]); + } + + gradOutput.transpose_(1, 2); + gradOutput = + gradOutput.view({batchSize, nOutputPlane, outputHeight, outputWidth}); + + gradInput = gradInput.view({batchSize, nInputPlane, inputHeight, inputWidth}); + input = input.view({batchSize, nInputPlane, inputHeight, inputWidth}); + gradOffset = gradOffset.view( + {batchSize, deformable_group * 2 * kH * kW, outputHeight, outputWidth}); + offset = offset.view( + {batchSize, deformable_group * 2 * kH * kW, outputHeight, outputWidth}); + + if (batch == 0) { + gradOutput = gradOutput.view({nOutputPlane, outputHeight, outputWidth}); + input = input.view({nInputPlane, inputHeight, inputWidth}); + gradInput = gradInput.view({nInputPlane, inputHeight, inputWidth}); + offset = offset.view({offset.size(1), offset.size(2), offset.size(3)}); + gradOffset = + gradOffset.view({offset.size(1), offset.size(2), offset.size(3)}); + } + + return 1; +} + +int deform_conv_backward_parameters_cuda( + at::Tensor input, at::Tensor offset, at::Tensor gradOutput, + at::Tensor gradWeight, // at::Tensor gradBias, + at::Tensor columns, at::Tensor ones, int kW, int kH, int dW, int dH, + int padW, int padH, int dilationW, int dilationH, int group, + int deformable_group, float scale, int im2col_step) +{ + // todo: transpose and reshape outGrad + // todo: reshape columns + // todo: add im2col_step as input + + shape_check(input, offset, &gradOutput, gradWeight, kH, kW, dH, dW, padH, + padW, dilationH, dilationW, group, deformable_group); + + input = input.contiguous(); + offset = offset.contiguous(); + gradOutput = gradOutput.contiguous(); + + int batch = 1; + + if (input.ndimension() == 3) { + // Force batch + batch = 0; + input = input.view( + at::IntList({1, input.size(0), input.size(1), input.size(2)})); + gradOutput = gradOutput.view( + {1, gradOutput.size(0), gradOutput.size(1), gradOutput.size(2)}); + } + + long batchSize = input.size(0); + long nInputPlane = input.size(1); + long inputHeight = input.size(2); + long inputWidth = input.size(3); + + long nOutputPlane = gradWeight.size(0); + + long outputWidth = + (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1; + long outputHeight = + (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1; + + AT_CHECK((offset.size(0) == batchSize), "invalid 
batch size of offset"); + + columns = at::zeros( + {nInputPlane * kW * kH, im2col_step * outputHeight * outputWidth}, + input.options()); + + gradOutput = gradOutput.view({batchSize / im2col_step, im2col_step, + nOutputPlane, outputHeight, outputWidth}); + gradOutput.transpose_(1, 2); + + at::Tensor gradOutputBuffer = at::zeros_like(gradOutput); + gradOutputBuffer = + gradOutputBuffer.view({batchSize / im2col_step, nOutputPlane, im2col_step, + outputHeight, outputWidth}); + gradOutputBuffer.copy_(gradOutput); + gradOutputBuffer = + gradOutputBuffer.view({batchSize / im2col_step, nOutputPlane, + im2col_step * outputHeight, outputWidth}); + + gradOutput.transpose_(1, 2); + gradOutput = + gradOutput.view({batchSize, nOutputPlane, outputHeight, outputWidth}); + + input = input.view({batchSize / im2col_step, im2col_step, nInputPlane, + inputHeight, inputWidth}); + offset = + offset.view({batchSize / im2col_step, im2col_step, + deformable_group * 2 * kH * kW, outputHeight, outputWidth}); + + for (int elt = 0; elt < batchSize / im2col_step; elt++) { + deformable_im2col(input[elt], offset[elt], nInputPlane, inputHeight, + inputWidth, kH, kW, padH, padW, dH, dW, dilationH, + dilationW, im2col_step, deformable_group, columns); + + // divide into group + gradOutputBuffer = gradOutputBuffer.view( + {gradOutputBuffer.size(0), group, gradOutputBuffer.size(1) / group, + gradOutputBuffer.size(2), gradOutputBuffer.size(3)}); + columns = columns.view({group, columns.size(0) / group, columns.size(1)}); + gradWeight = + gradWeight.view({group, gradWeight.size(0) / group, gradWeight.size(1), + gradWeight.size(2), gradWeight.size(3)}); + + for (int g = 0; g < group; g++) { + gradWeight[g] = gradWeight[g] + .flatten(1) + .addmm_(gradOutputBuffer[elt][g].flatten(1), + columns[g].transpose(1, 0), 1.0, scale) + .view_as(gradWeight[g]); + } + gradOutputBuffer = gradOutputBuffer.view( + {gradOutputBuffer.size(0), + gradOutputBuffer.size(1) * gradOutputBuffer.size(2), + gradOutputBuffer.size(3), gradOutputBuffer.size(4)}); + columns = + columns.view({columns.size(0) * columns.size(1), columns.size(2)}); + gradWeight = gradWeight.view({gradWeight.size(0) * gradWeight.size(1), + gradWeight.size(2), gradWeight.size(3), + gradWeight.size(4)}); + } + + input = input.view({batchSize, nInputPlane, inputHeight, inputWidth}); + offset = offset.view( + {batchSize, deformable_group * 2 * kH * kW, outputHeight, outputWidth}); + + if (batch == 0) { + gradOutput = gradOutput.view({nOutputPlane, outputHeight, outputWidth}); + input = input.view({nInputPlane, inputHeight, inputWidth}); + } + + return 1; +} + +void modulated_deform_conv_cuda_forward( + at::Tensor input, at::Tensor weight, at::Tensor bias, at::Tensor ones, + at::Tensor offset, at::Tensor mask, at::Tensor output, at::Tensor columns, + int kernel_h, int kernel_w, const int stride_h, const int stride_w, + const int pad_h, const int pad_w, const int dilation_h, + const int dilation_w, const int group, const int deformable_group, + const bool with_bias) +{ + AT_CHECK(input.is_contiguous(), "input tensor has to be contiguous"); + AT_CHECK(weight.is_contiguous(), "weight tensor has to be contiguous"); + + const int batch = input.size(0); + const int channels = input.size(1); + const int height = input.size(2); + const int width = input.size(3); + + const int channels_out = weight.size(0); + const int channels_kernel = weight.size(1); + const int kernel_h_ = weight.size(2); + const int kernel_w_ = weight.size(3); + + if (kernel_h_ != kernel_h || kernel_w_ != kernel_w) + 
AT_ERROR("Input shape and kernel shape wont match: (%d x %d vs %d x %d).", + kernel_h_, kernel_w, kernel_h_, kernel_w_); + if (channels != channels_kernel * group) + AT_ERROR("Input shape and kernel channels wont match: (%d vs %d).", + channels, channels_kernel * group); + + const int height_out = + (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1; + const int width_out = + (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1; + + if (ones.ndimension() != 2 || + ones.size(0) * ones.size(1) < height_out * width_out) { + // Resize plane and fill with ones... + ones = at::ones({height_out, width_out}, input.options()); + } + + // resize output + output = output.view({batch, channels_out, height_out, width_out}).zero_(); + // resize temporary columns + columns = + at::zeros({channels * kernel_h * kernel_w, 1 * height_out * width_out}, + input.options()); + + output = output.view({output.size(0), group, output.size(1) / group, + output.size(2), output.size(3)}); + + for (int b = 0; b < batch; b++) { + modulated_deformable_im2col_cuda( + input[b], offset[b], mask[b], 1, channels, height, width, height_out, + width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, + dilation_h, dilation_w, deformable_group, columns); + + // divide into group + weight = weight.view({group, weight.size(0) / group, weight.size(1), + weight.size(2), weight.size(3)}); + columns = columns.view({group, columns.size(0) / group, columns.size(1)}); + + for (int g = 0; g < group; g++) { + output[b][g] = output[b][g] + .flatten(1) + .addmm_(weight[g].flatten(1), columns[g]) + .view_as(output[b][g]); + } + + weight = weight.view({weight.size(0) * weight.size(1), weight.size(2), + weight.size(3), weight.size(4)}); + columns = + columns.view({columns.size(0) * columns.size(1), columns.size(2)}); + } + + output = output.view({output.size(0), output.size(1) * output.size(2), + output.size(3), output.size(4)}); + + if (with_bias) { + output += bias.view({1, bias.size(0), 1, 1}); + } +} + +void modulated_deform_conv_cuda_backward( + at::Tensor input, at::Tensor weight, at::Tensor bias, at::Tensor ones, + at::Tensor offset, at::Tensor mask, at::Tensor columns, + at::Tensor grad_input, at::Tensor grad_weight, at::Tensor grad_bias, + at::Tensor grad_offset, at::Tensor grad_mask, at::Tensor grad_output, + int kernel_h, int kernel_w, int stride_h, int stride_w, int pad_h, + int pad_w, int dilation_h, int dilation_w, int group, int deformable_group, + const bool with_bias) +{ + AT_CHECK(input.is_contiguous(), "input tensor has to be contiguous"); + AT_CHECK(weight.is_contiguous(), "weight tensor has to be contiguous"); + + const int batch = input.size(0); + const int channels = input.size(1); + const int height = input.size(2); + const int width = input.size(3); + + const int channels_kernel = weight.size(1); + const int kernel_h_ = weight.size(2); + const int kernel_w_ = weight.size(3); + if (kernel_h_ != kernel_h || kernel_w_ != kernel_w) + AT_ERROR("Input shape and kernel shape wont match: (%d x %d vs %d x %d).", + kernel_h_, kernel_w, kernel_h_, kernel_w_); + if (channels != channels_kernel * group) + AT_ERROR("Input shape and kernel channels wont match: (%d vs %d).", + channels, channels_kernel * group); + + const int height_out = + (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1; + const int width_out = + (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1; + + if (ones.ndimension() != 2 || + ones.size(0) * ones.size(1) < height_out * 
width_out) { + // Resize plane and fill with ones... + ones = at::ones({height_out, width_out}, input.options()); + } + + grad_input = grad_input.view({batch, channels, height, width}); + columns = at::zeros({channels * kernel_h * kernel_w, height_out * width_out}, + input.options()); + + grad_output = + grad_output.view({grad_output.size(0), group, grad_output.size(1) / group, + grad_output.size(2), grad_output.size(3)}); + + for (int b = 0; b < batch; b++) { + // divide int group + columns = columns.view({group, columns.size(0) / group, columns.size(1)}); + weight = weight.view({group, weight.size(0) / group, weight.size(1), + weight.size(2), weight.size(3)}); + + for (int g = 0; g < group; g++) { + columns[g].addmm_(weight[g].flatten(1).transpose(0, 1), + grad_output[b][g].flatten(1), 0.0f, 1.0f); + } + + columns = + columns.view({columns.size(0) * columns.size(1), columns.size(2)}); + weight = weight.view({weight.size(0) * weight.size(1), weight.size(2), + weight.size(3), weight.size(4)}); + + // gradient w.r.t. input coordinate data + modulated_deformable_col2im_coord_cuda( + columns, input[b], offset[b], mask[b], 1, channels, height, width, + height_out, width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, + stride_w, dilation_h, dilation_w, deformable_group, grad_offset[b], + grad_mask[b]); + // gradient w.r.t. input data + modulated_deformable_col2im_cuda( + columns, offset[b], mask[b], 1, channels, height, width, height_out, + width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, + dilation_h, dilation_w, deformable_group, grad_input[b]); + + // gradient w.r.t. weight, dWeight should accumulate across the batch and + // group + modulated_deformable_im2col_cuda( + input[b], offset[b], mask[b], 1, channels, height, width, height_out, + width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, + dilation_h, dilation_w, deformable_group, columns); + + columns = columns.view({group, columns.size(0) / group, columns.size(1)}); + grad_weight = grad_weight.view({group, grad_weight.size(0) / group, + grad_weight.size(1), grad_weight.size(2), + grad_weight.size(3)}); + if (with_bias) + grad_bias = grad_bias.view({group, grad_bias.size(0) / group}); + + for (int g = 0; g < group; g++) { + grad_weight[g] = + grad_weight[g] + .flatten(1) + .addmm_(grad_output[b][g].flatten(1), columns[g].transpose(0, 1)) + .view_as(grad_weight[g]); + if (with_bias) { + grad_bias[g] = + grad_bias[g] + .view({-1, 1}) + .addmm_(grad_output[b][g].flatten(1), ones.view({-1, 1})) + .view(-1); + } + } + + columns = + columns.view({columns.size(0) * columns.size(1), columns.size(2)}); + grad_weight = grad_weight.view({grad_weight.size(0) * grad_weight.size(1), + grad_weight.size(2), grad_weight.size(3), + grad_weight.size(4)}); + if (with_bias) + grad_bias = grad_bias.view({grad_bias.size(0) * grad_bias.size(1)}); + } + grad_output = grad_output.view({grad_output.size(0) * grad_output.size(1), + grad_output.size(2), grad_output.size(3), + grad_output.size(4)}); +} diff --git a/maskrcnn_benchmark/csrc/cuda/deform_conv_kernel_cuda.cu b/maskrcnn_benchmark/csrc/cuda/deform_conv_kernel_cuda.cu new file mode 100644 index 000000000..b4f8813b4 --- /dev/null +++ b/maskrcnn_benchmark/csrc/cuda/deform_conv_kernel_cuda.cu @@ -0,0 +1,874 @@ +/*! + ******************* BEGIN Caffe Copyright Notice and Disclaimer **************** + * + * COPYRIGHT + * + * All contributions by the University of California: + * Copyright (c) 2014-2017 The Regents of the University of California (Regents) + * All rights reserved. 
+ * + * All other contributions: + * Copyright (c) 2014-2017, the respective contributors + * All rights reserved. + * + * Caffe uses a shared copyright model: each contributor holds copyright over + * their contributions to Caffe. The project versioning records all such + * contribution and copyright details. If a contributor wants to further mark + * their specific copyright on a particular contribution, they should indicate + * their copyright solely in the commit message of the change when it is + * committed. + * + * LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * CONTRIBUTION AGREEMENT + * + * By contributing to the BVLC/caffe repository through pull-request, comment, + * or otherwise, the contributor releases their content to the + * license and copyright terms herein. + * + ***************** END Caffe Copyright Notice and Disclaimer ******************** + * + * Copyright (c) 2018 Microsoft + * Licensed under The MIT License [see LICENSE for details] + * \file modulated_deformable_im2col.cuh + * \brief Function definitions of converting an image to + * column matrix based on kernel, padding, dilation, and offset. + * These functions are mainly used in deformable convolution operators. 
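For reference while reading the kernels below: every `*_im2col` kernel in this file samples the input feature map at fractional, offset-shifted positions with bilinear interpolation, and taps that fall outside the map contribute zero (this is what `deformable_im2col_bilinear` and `dmcn_im2col_bilinear` implement further down). The following NumPy sketch restates that sampling rule for illustration only; the name `bilinear_sample` and the toy input are not part of the patch.

```python
import numpy as np

def bilinear_sample(im, h, w):
    """Bilinear lookup of a fractional location (h, w) in a 2-D feature map.

    Mirrors deformable_im2col_bilinear / dmcn_im2col_bilinear: the four integer
    neighbours are weighted by the fractional parts of (h, w), and neighbours
    outside the map are treated as zero.
    """
    height, width = im.shape
    h_low, w_low = int(np.floor(h)), int(np.floor(w))
    h_high, w_high = h_low + 1, w_low + 1
    lh, lw = h - h_low, w - w_low          # fractional parts
    hh, hw = 1.0 - lh, 1.0 - lw

    def tap(y, x):
        return float(im[y, x]) if 0 <= y < height and 0 <= x < width else 0.0

    return (hh * hw * tap(h_low, w_low) + hh * lw * tap(h_low, w_high) +
            lh * hw * tap(h_high, w_low) + lh * lw * tap(h_high, w_high))

# Sampling midway between four pixels averages them:
im = np.array([[0.0, 1.0],
               [2.0, 3.0]])
print(bilinear_sample(im, 0.5, 0.5))  # 1.5
```

The modulated (DCNv2) kernels below additionally multiply each sampled value by a learned mask before writing it into the column buffer.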
+ * \ref: https://arxiv.org/abs/1703.06211 + * \author Yuwen Xiong, Haozhi Qi, Jifeng Dai, Xizhou Zhu, Han Hu, Dazhi Cheng + */ + +// modify from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/blob/mmdetection/mmdet/ops/dcn/src/deform_conv_cuda_kernel.cu + + +#include +#include +#include +#include +#include + +using namespace at; + +#define CUDA_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ + i += blockDim.x * gridDim.x) + +const int CUDA_NUM_THREADS = 1024; +const int kMaxGridNum = 65535; +inline int GET_BLOCKS(const int N) +{ + return std::min(kMaxGridNum, (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS); +} + +/* +const int CUDA_NUM_THREADS = 1024; + +inline int GET_BLOCKS(const int N) +{ + return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; +}*/ + +template +__device__ scalar_t deformable_im2col_bilinear(const scalar_t *bottom_data, const int data_width, + const int height, const int width, scalar_t h, scalar_t w) +{ + + int h_low = floor(h); + int w_low = floor(w); + int h_high = h_low + 1; + int w_high = w_low + 1; + + scalar_t lh = h - h_low; + scalar_t lw = w - w_low; + scalar_t hh = 1 - lh, hw = 1 - lw; + + scalar_t v1 = 0; + if (h_low >= 0 && w_low >= 0) + v1 = bottom_data[h_low * data_width + w_low]; + scalar_t v2 = 0; + if (h_low >= 0 && w_high <= width - 1) + v2 = bottom_data[h_low * data_width + w_high]; + scalar_t v3 = 0; + if (h_high <= height - 1 && w_low >= 0) + v3 = bottom_data[h_high * data_width + w_low]; + scalar_t v4 = 0; + if (h_high <= height - 1 && w_high <= width - 1) + v4 = bottom_data[h_high * data_width + w_high]; + + scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; + + scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + return val; +} + +template +__device__ scalar_t get_gradient_weight(scalar_t argmax_h, scalar_t argmax_w, + const int h, const int w, const int height, const int width) +{ + + if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || argmax_w >= width) + { + //empty + return 0; + } + + int argmax_h_low = floor(argmax_h); + int argmax_w_low = floor(argmax_w); + int argmax_h_high = argmax_h_low + 1; + int argmax_w_high = argmax_w_low + 1; + + scalar_t weight = 0; + if (h == argmax_h_low && w == argmax_w_low) + weight = (h + 1 - argmax_h) * (w + 1 - argmax_w); + if (h == argmax_h_low && w == argmax_w_high) + weight = (h + 1 - argmax_h) * (argmax_w + 1 - w); + if (h == argmax_h_high && w == argmax_w_low) + weight = (argmax_h + 1 - h) * (w + 1 - argmax_w); + if (h == argmax_h_high && w == argmax_w_high) + weight = (argmax_h + 1 - h) * (argmax_w + 1 - w); + return weight; +} + +template +__device__ scalar_t get_coordinate_weight(scalar_t argmax_h, scalar_t argmax_w, + const int height, const int width, const scalar_t *im_data, + const int data_width, const int bp_dir) +{ + + if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || argmax_w >= width) + { + //empty + return 0; + } + + int argmax_h_low = floor(argmax_h); + int argmax_w_low = floor(argmax_w); + int argmax_h_high = argmax_h_low + 1; + int argmax_w_high = argmax_w_low + 1; + + scalar_t weight = 0; + + if (bp_dir == 0) + { + if (argmax_h_low >= 0 && argmax_w_low >= 0) + weight += -1 * (argmax_w_low + 1 - argmax_w) * im_data[argmax_h_low * data_width + argmax_w_low]; + if (argmax_h_low >= 0 && argmax_w_high <= width - 1) + weight += -1 * (argmax_w - argmax_w_low) * im_data[argmax_h_low * data_width + argmax_w_high]; + if (argmax_h_high <= height - 1 && argmax_w_low >= 0) + weight += (argmax_w_low + 1 - 
argmax_w) * im_data[argmax_h_high * data_width + argmax_w_low]; + if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1) + weight += (argmax_w - argmax_w_low) * im_data[argmax_h_high * data_width + argmax_w_high]; + } + else if (bp_dir == 1) + { + if (argmax_h_low >= 0 && argmax_w_low >= 0) + weight += -1 * (argmax_h_low + 1 - argmax_h) * im_data[argmax_h_low * data_width + argmax_w_low]; + if (argmax_h_low >= 0 && argmax_w_high <= width - 1) + weight += (argmax_h_low + 1 - argmax_h) * im_data[argmax_h_low * data_width + argmax_w_high]; + if (argmax_h_high <= height - 1 && argmax_w_low >= 0) + weight += -1 * (argmax_h - argmax_h_low) * im_data[argmax_h_high * data_width + argmax_w_low]; + if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1) + weight += (argmax_h - argmax_h_low) * im_data[argmax_h_high * data_width + argmax_w_high]; + } + + return weight; +} + +template +__global__ void deformable_im2col_gpu_kernel(const int n, const scalar_t *data_im, const scalar_t *data_offset, + const int height, const int width, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, const int channel_per_deformable_group, + const int batch_size, const int num_channels, const int deformable_group, + const int height_col, const int width_col, + scalar_t *data_col) +{ + CUDA_KERNEL_LOOP(index, n) + { + // index index of output matrix + const int w_col = index % width_col; + const int h_col = (index / width_col) % height_col; + const int b_col = (index / width_col / height_col) % batch_size; + const int c_im = (index / width_col / height_col) / batch_size; + const int c_col = c_im * kernel_h * kernel_w; + + // compute deformable group index + const int deformable_group_index = c_im / channel_per_deformable_group; + + const int h_in = h_col * stride_h - pad_h; + const int w_in = w_col * stride_w - pad_w; + scalar_t *data_col_ptr = data_col + ((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col; + //const scalar_t* data_im_ptr = data_im + ((b_col * num_channels + c_im) * height + h_in) * width + w_in; + const scalar_t *data_im_ptr = data_im + (b_col * num_channels + c_im) * height * width; + const scalar_t *data_offset_ptr = data_offset + (b_col * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col; + + for (int i = 0; i < kernel_h; ++i) + { + for (int j = 0; j < kernel_w; ++j) + { + const int data_offset_h_ptr = ((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col; + const int data_offset_w_ptr = ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col + w_col; + const scalar_t offset_h = data_offset_ptr[data_offset_h_ptr]; + const scalar_t offset_w = data_offset_ptr[data_offset_w_ptr]; + scalar_t val = static_cast(0); + const scalar_t h_im = h_in + i * dilation_h + offset_h; + const scalar_t w_im = w_in + j * dilation_w + offset_w; + if (h_im > -1 && w_im > -1 && h_im < height && w_im < width) + { + //const scalar_t map_h = i * dilation_h + offset_h; + //const scalar_t map_w = j * dilation_w + offset_w; + //const int cur_height = height - h_in; + //const int cur_width = width - w_in; + //val = deformable_im2col_bilinear(data_im_ptr, width, cur_height, cur_width, map_h, map_w); + val = deformable_im2col_bilinear(data_im_ptr, width, height, width, h_im, w_im); + } + *data_col_ptr = val; + data_col_ptr += batch_size * height_col * width_col; + } + } + } +} + +void deformable_im2col( + const at::Tensor 
data_im, const at::Tensor data_offset, const int channels, + const int height, const int width, const int ksize_h, const int ksize_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, const int parallel_imgs, + const int deformable_group, at::Tensor data_col) +{ + // num_axes should be smaller than block size + // todo: check parallel_imgs is correctly passed in + int height_col = (height + 2 * pad_h - (dilation_h * (ksize_h - 1) + 1)) / stride_h + 1; + int width_col = (width + 2 * pad_w - (dilation_w * (ksize_w - 1) + 1)) / stride_w + 1; + int num_kernels = channels * height_col * width_col * parallel_imgs; + int channel_per_deformable_group = channels / deformable_group; + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + data_im.type(), "deformable_im2col_gpu", ([&] { + const scalar_t *data_im_ = data_im.data(); + const scalar_t *data_offset_ = data_offset.data(); + scalar_t *data_col_ = data_col.data(); + + deformable_im2col_gpu_kernel<<>>( + num_kernels, data_im_, data_offset_, height, width, ksize_h, ksize_w, + pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, + channel_per_deformable_group, parallel_imgs, channels, deformable_group, + height_col, width_col, data_col_); + })); + + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + { + printf("error in deformable_im2col: %s\n", cudaGetErrorString(err)); + } +} + +template +__global__ void deformable_col2im_gpu_kernel( + const int n, const scalar_t *data_col, const scalar_t *data_offset, + const int channels, const int height, const int width, + const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int channel_per_deformable_group, + const int batch_size, const int deformable_group, + const int height_col, const int width_col, + scalar_t *grad_im) +{ + CUDA_KERNEL_LOOP(index, n) + { + const int j = (index / width_col / height_col / batch_size) % kernel_w; + const int i = (index / width_col / height_col / batch_size / kernel_w) % kernel_h; + const int c = index / width_col / height_col / batch_size / kernel_w / kernel_h; + // compute the start and end of the output + + const int deformable_group_index = c / channel_per_deformable_group; + + int w_out = index % width_col; + int h_out = (index / width_col) % height_col; + int b = (index / width_col / height_col) % batch_size; + int w_in = w_out * stride_w - pad_w; + int h_in = h_out * stride_h - pad_h; + + const scalar_t *data_offset_ptr = data_offset + (b * deformable_group + deformable_group_index) * + 2 * kernel_h * kernel_w * height_col * width_col; + const int data_offset_h_ptr = ((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out; + const int data_offset_w_ptr = ((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out; + const scalar_t offset_h = data_offset_ptr[data_offset_h_ptr]; + const scalar_t offset_w = data_offset_ptr[data_offset_w_ptr]; + const scalar_t cur_inv_h_data = h_in + i * dilation_h + offset_h; + const scalar_t cur_inv_w_data = w_in + j * dilation_w + offset_w; + + const scalar_t cur_top_grad = data_col[index]; + const int cur_h = (int)cur_inv_h_data; + const int cur_w = (int)cur_inv_w_data; + for (int dy = -2; dy <= 2; dy++) + { + for (int dx = -2; dx <= 2; dx++) + { + if (cur_h + dy >= 0 && cur_h + dy < height && + cur_w + dx >= 0 && cur_w + dx < width && + abs(cur_inv_h_data - (cur_h + dy)) < 1 && + abs(cur_inv_w_data - (cur_w + 
dx)) < 1) + { + int cur_bottom_grad_pos = ((b * channels + c) * height + cur_h + dy) * width + cur_w + dx; + scalar_t weight = get_gradient_weight(cur_inv_h_data, cur_inv_w_data, cur_h + dy, cur_w + dx, height, width); + atomicAdd(grad_im + cur_bottom_grad_pos, weight * cur_top_grad); + } + } + } + } +} + +void deformable_col2im( + const at::Tensor data_col, const at::Tensor data_offset, const int channels, + const int height, const int width, const int ksize_h, + const int ksize_w, const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int parallel_imgs, const int deformable_group, + at::Tensor grad_im) +{ + + // todo: make sure parallel_imgs is passed in correctly + int height_col = (height + 2 * pad_h - (dilation_h * (ksize_h - 1) + 1)) / stride_h + 1; + int width_col = (width + 2 * pad_w - (dilation_w * (ksize_w - 1) + 1)) / stride_w + 1; + int num_kernels = channels * ksize_h * ksize_w * height_col * width_col * parallel_imgs; + int channel_per_deformable_group = channels / deformable_group; + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + data_col.type(), "deformable_col2im_gpu", ([&] { + const scalar_t *data_col_ = data_col.data(); + const scalar_t *data_offset_ = data_offset.data(); + scalar_t *grad_im_ = grad_im.data(); + + deformable_col2im_gpu_kernel<<>>( + num_kernels, data_col_, data_offset_, channels, height, width, ksize_h, + ksize_w, pad_h, pad_w, stride_h, stride_w, + dilation_h, dilation_w, channel_per_deformable_group, + parallel_imgs, deformable_group, height_col, width_col, grad_im_); + })); + + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + { + printf("error in deformable_col2im: %s\n", cudaGetErrorString(err)); + } +} + +template +__global__ void deformable_col2im_coord_gpu_kernel(const int n, const scalar_t *data_col, + const scalar_t *data_im, const scalar_t *data_offset, + const int channels, const int height, const int width, + const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int channel_per_deformable_group, + const int batch_size, const int offset_channels, const int deformable_group, + const int height_col, const int width_col, scalar_t *grad_offset) +{ + CUDA_KERNEL_LOOP(index, n) + { + scalar_t val = 0; + int w = index % width_col; + int h = (index / width_col) % height_col; + int c = (index / width_col / height_col) % offset_channels; + int b = (index / width_col / height_col) / offset_channels; + // compute the start and end of the output + + const int deformable_group_index = c / (2 * kernel_h * kernel_w); + const int col_step = kernel_h * kernel_w; + int cnt = 0; + const scalar_t *data_col_ptr = data_col + deformable_group_index * channel_per_deformable_group * + batch_size * width_col * height_col; + const scalar_t *data_im_ptr = data_im + (b * deformable_group + deformable_group_index) * + channel_per_deformable_group / kernel_h / kernel_w * height * width; + const scalar_t *data_offset_ptr = data_offset + (b * deformable_group + deformable_group_index) * 2 * + kernel_h * kernel_w * height_col * width_col; + + const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w; + + for (int col_c = (offset_c / 2); col_c < channel_per_deformable_group; col_c += col_step) + { + const int col_pos = (((col_c * batch_size + b) * height_col) + h) * width_col + w; + const int bp_dir = offset_c % 2; + + int j = (col_pos / width_col / height_col / 
batch_size) % kernel_w; + int i = (col_pos / width_col / height_col / batch_size / kernel_w) % kernel_h; + int w_out = col_pos % width_col; + int h_out = (col_pos / width_col) % height_col; + int w_in = w_out * stride_w - pad_w; + int h_in = h_out * stride_h - pad_h; + const int data_offset_h_ptr = (((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out); + const int data_offset_w_ptr = (((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out); + const scalar_t offset_h = data_offset_ptr[data_offset_h_ptr]; + const scalar_t offset_w = data_offset_ptr[data_offset_w_ptr]; + scalar_t inv_h = h_in + i * dilation_h + offset_h; + scalar_t inv_w = w_in + j * dilation_w + offset_w; + if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width) + { + inv_h = inv_w = -2; + } + const scalar_t weight = get_coordinate_weight( + inv_h, inv_w, + height, width, data_im_ptr + cnt * height * width, width, bp_dir); + val += weight * data_col_ptr[col_pos]; + cnt += 1; + } + + grad_offset[index] = val; + } +} + +void deformable_col2im_coord( + const at::Tensor data_col, const at::Tensor data_im, const at::Tensor data_offset, + const int channels, const int height, const int width, const int ksize_h, + const int ksize_w, const int pad_h, const int pad_w, const int stride_h, + const int stride_w, const int dilation_h, const int dilation_w, + const int parallel_imgs, const int deformable_group, at::Tensor grad_offset) +{ + + int height_col = (height + 2 * pad_h - (dilation_h * (ksize_h - 1) + 1)) / stride_h + 1; + int width_col = (width + 2 * pad_w - (dilation_w * (ksize_w - 1) + 1)) / stride_w + 1; + int num_kernels = height_col * width_col * 2 * ksize_h * ksize_w * deformable_group * parallel_imgs; + int channel_per_deformable_group = channels * ksize_h * ksize_w / deformable_group; + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + data_col.type(), "deformable_col2im_coord_gpu", ([&] { + const scalar_t *data_col_ = data_col.data(); + const scalar_t *data_im_ = data_im.data(); + const scalar_t *data_offset_ = data_offset.data(); + scalar_t *grad_offset_ = grad_offset.data(); + + deformable_col2im_coord_gpu_kernel<<>>( + num_kernels, data_col_, data_im_, data_offset_, channels, height, width, + ksize_h, ksize_w, pad_h, pad_w, stride_h, stride_w, + dilation_h, dilation_w, channel_per_deformable_group, + parallel_imgs, 2 * ksize_h * ksize_w * deformable_group, deformable_group, + height_col, width_col, grad_offset_); + })); +} + +template +__device__ scalar_t dmcn_im2col_bilinear(const scalar_t *bottom_data, const int data_width, + const int height, const int width, scalar_t h, scalar_t w) +{ + int h_low = floor(h); + int w_low = floor(w); + int h_high = h_low + 1; + int w_high = w_low + 1; + + scalar_t lh = h - h_low; + scalar_t lw = w - w_low; + scalar_t hh = 1 - lh, hw = 1 - lw; + + scalar_t v1 = 0; + if (h_low >= 0 && w_low >= 0) + v1 = bottom_data[h_low * data_width + w_low]; + scalar_t v2 = 0; + if (h_low >= 0 && w_high <= width - 1) + v2 = bottom_data[h_low * data_width + w_high]; + scalar_t v3 = 0; + if (h_high <= height - 1 && w_low >= 0) + v3 = bottom_data[h_high * data_width + w_low]; + scalar_t v4 = 0; + if (h_high <= height - 1 && w_high <= width - 1) + v4 = bottom_data[h_high * data_width + w_high]; + + scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; + + scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + return val; +} + +template +__device__ scalar_t dmcn_get_gradient_weight(scalar_t argmax_h, scalar_t argmax_w, + const int h, const int w, const 
int height, const int width) +{ + if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || argmax_w >= width) + { + //empty + return 0; + } + + int argmax_h_low = floor(argmax_h); + int argmax_w_low = floor(argmax_w); + int argmax_h_high = argmax_h_low + 1; + int argmax_w_high = argmax_w_low + 1; + + scalar_t weight = 0; + if (h == argmax_h_low && w == argmax_w_low) + weight = (h + 1 - argmax_h) * (w + 1 - argmax_w); + if (h == argmax_h_low && w == argmax_w_high) + weight = (h + 1 - argmax_h) * (argmax_w + 1 - w); + if (h == argmax_h_high && w == argmax_w_low) + weight = (argmax_h + 1 - h) * (w + 1 - argmax_w); + if (h == argmax_h_high && w == argmax_w_high) + weight = (argmax_h + 1 - h) * (argmax_w + 1 - w); + return weight; +} + +template +__device__ scalar_t dmcn_get_coordinate_weight(scalar_t argmax_h, scalar_t argmax_w, + const int height, const int width, const scalar_t *im_data, + const int data_width, const int bp_dir) +{ + if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || argmax_w >= width) + { + //empty + return 0; + } + + int argmax_h_low = floor(argmax_h); + int argmax_w_low = floor(argmax_w); + int argmax_h_high = argmax_h_low + 1; + int argmax_w_high = argmax_w_low + 1; + + scalar_t weight = 0; + + if (bp_dir == 0) + { + if (argmax_h_low >= 0 && argmax_w_low >= 0) + weight += -1 * (argmax_w_low + 1 - argmax_w) * im_data[argmax_h_low * data_width + argmax_w_low]; + if (argmax_h_low >= 0 && argmax_w_high <= width - 1) + weight += -1 * (argmax_w - argmax_w_low) * im_data[argmax_h_low * data_width + argmax_w_high]; + if (argmax_h_high <= height - 1 && argmax_w_low >= 0) + weight += (argmax_w_low + 1 - argmax_w) * im_data[argmax_h_high * data_width + argmax_w_low]; + if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1) + weight += (argmax_w - argmax_w_low) * im_data[argmax_h_high * data_width + argmax_w_high]; + } + else if (bp_dir == 1) + { + if (argmax_h_low >= 0 && argmax_w_low >= 0) + weight += -1 * (argmax_h_low + 1 - argmax_h) * im_data[argmax_h_low * data_width + argmax_w_low]; + if (argmax_h_low >= 0 && argmax_w_high <= width - 1) + weight += (argmax_h_low + 1 - argmax_h) * im_data[argmax_h_low * data_width + argmax_w_high]; + if (argmax_h_high <= height - 1 && argmax_w_low >= 0) + weight += -1 * (argmax_h - argmax_h_low) * im_data[argmax_h_high * data_width + argmax_w_low]; + if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1) + weight += (argmax_h - argmax_h_low) * im_data[argmax_h_high * data_width + argmax_w_high]; + } + + return weight; +} + +template +__global__ void modulated_deformable_im2col_gpu_kernel(const int n, + const scalar_t *data_im, const scalar_t *data_offset, const scalar_t *data_mask, + const int height, const int width, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int channel_per_deformable_group, + const int batch_size, const int num_channels, const int deformable_group, + const int height_col, const int width_col, + scalar_t *data_col) +{ + CUDA_KERNEL_LOOP(index, n) + { + // index index of output matrix + const int w_col = index % width_col; + const int h_col = (index / width_col) % height_col; + const int b_col = (index / width_col / height_col) % batch_size; + const int c_im = (index / width_col / height_col) / batch_size; + const int c_col = c_im * kernel_h * kernel_w; + + // compute deformable group index + const int deformable_group_index = c_im / channel_per_deformable_group; + + 
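The flat thread index decoded just above packs (output column, output row, image in the mini-batch, input channel) into a single integer, and the deformable-group index selects which slice of the offset and mask tensors this thread reads. A small Python sketch of that index arithmetic, for illustration only; `decode_index` and the example values below are not part of the patch.

```python
def decode_index(index, width_col, height_col, batch_size,
                 kernel_h, kernel_w, channel_per_deformable_group):
    """Mirror of the index decomposition in modulated_deformable_im2col_gpu_kernel."""
    w_col = index % width_col                                 # output column
    h_col = (index // width_col) % height_col                 # output row
    b_col = (index // (width_col * height_col)) % batch_size  # image within the mini-batch
    c_im = index // (width_col * height_col * batch_size)     # input channel
    c_col = c_im * kernel_h * kernel_w                        # first column-buffer row for this channel
    group = c_im // channel_per_deformable_group              # offset/mask group to read from
    return w_col, h_col, b_col, c_im, c_col, group

# 2x2 output, batch of 1, 3x3 kernel, one input channel per deformable group:
print(decode_index(5, width_col=2, height_col=2, batch_size=1,
                   kernel_h=3, kernel_w=3, channel_per_deformable_group=1))
# -> (1, 0, 0, 1, 9, 1): channel 1, output position (row 0, col 1), group 1
```

Each thread therefore fills `kernel_h * kernel_w` rows of the column buffer for one input channel at one output location, which is why the host wrappers set `num_kernels = channels * batch_size * height_col * width_col`.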
const int h_in = h_col * stride_h - pad_h; + const int w_in = w_col * stride_w - pad_w; + + scalar_t *data_col_ptr = data_col + ((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col; + //const float* data_im_ptr = data_im + ((b_col * num_channels + c_im) * height + h_in) * width + w_in; + const scalar_t *data_im_ptr = data_im + (b_col * num_channels + c_im) * height * width; + const scalar_t *data_offset_ptr = data_offset + (b_col * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col; + + const scalar_t *data_mask_ptr = data_mask + (b_col * deformable_group + deformable_group_index) * kernel_h * kernel_w * height_col * width_col; + + for (int i = 0; i < kernel_h; ++i) + { + for (int j = 0; j < kernel_w; ++j) + { + const int data_offset_h_ptr = ((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col; + const int data_offset_w_ptr = ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col + w_col; + const int data_mask_hw_ptr = ((i * kernel_w + j) * height_col + h_col) * width_col + w_col; + const scalar_t offset_h = data_offset_ptr[data_offset_h_ptr]; + const scalar_t offset_w = data_offset_ptr[data_offset_w_ptr]; + const scalar_t mask = data_mask_ptr[data_mask_hw_ptr]; + scalar_t val = static_cast(0); + const scalar_t h_im = h_in + i * dilation_h + offset_h; + const scalar_t w_im = w_in + j * dilation_w + offset_w; + //if (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) { + if (h_im > -1 && w_im > -1 && h_im < height && w_im < width) + { + //const float map_h = i * dilation_h + offset_h; + //const float map_w = j * dilation_w + offset_w; + //const int cur_height = height - h_in; + //const int cur_width = width - w_in; + //val = dmcn_im2col_bilinear(data_im_ptr, width, cur_height, cur_width, map_h, map_w); + val = dmcn_im2col_bilinear(data_im_ptr, width, height, width, h_im, w_im); + } + *data_col_ptr = val * mask; + data_col_ptr += batch_size * height_col * width_col; + //data_col_ptr += height_col * width_col; + } + } + } +} + +template +__global__ void modulated_deformable_col2im_gpu_kernel(const int n, + const scalar_t *data_col, const scalar_t *data_offset, const scalar_t *data_mask, + const int channels, const int height, const int width, + const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int channel_per_deformable_group, + const int batch_size, const int deformable_group, + const int height_col, const int width_col, + scalar_t *grad_im) +{ + CUDA_KERNEL_LOOP(index, n) + { + const int j = (index / width_col / height_col / batch_size) % kernel_w; + const int i = (index / width_col / height_col / batch_size / kernel_w) % kernel_h; + const int c = index / width_col / height_col / batch_size / kernel_w / kernel_h; + // compute the start and end of the output + + const int deformable_group_index = c / channel_per_deformable_group; + + int w_out = index % width_col; + int h_out = (index / width_col) % height_col; + int b = (index / width_col / height_col) % batch_size; + int w_in = w_out * stride_w - pad_w; + int h_in = h_out * stride_h - pad_h; + + const scalar_t *data_offset_ptr = data_offset + (b * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col; + const scalar_t *data_mask_ptr = data_mask + (b * deformable_group + deformable_group_index) * kernel_h * kernel_w * height_col * width_col; + const int data_offset_h_ptr = ((2 * (i * 
kernel_w + j)) * height_col + h_out) * width_col + w_out; + const int data_offset_w_ptr = ((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out; + const int data_mask_hw_ptr = ((i * kernel_w + j) * height_col + h_out) * width_col + w_out; + const scalar_t offset_h = data_offset_ptr[data_offset_h_ptr]; + const scalar_t offset_w = data_offset_ptr[data_offset_w_ptr]; + const scalar_t mask = data_mask_ptr[data_mask_hw_ptr]; + const scalar_t cur_inv_h_data = h_in + i * dilation_h + offset_h; + const scalar_t cur_inv_w_data = w_in + j * dilation_w + offset_w; + + const scalar_t cur_top_grad = data_col[index] * mask; + const int cur_h = (int)cur_inv_h_data; + const int cur_w = (int)cur_inv_w_data; + for (int dy = -2; dy <= 2; dy++) + { + for (int dx = -2; dx <= 2; dx++) + { + if (cur_h + dy >= 0 && cur_h + dy < height && + cur_w + dx >= 0 && cur_w + dx < width && + abs(cur_inv_h_data - (cur_h + dy)) < 1 && + abs(cur_inv_w_data - (cur_w + dx)) < 1) + { + int cur_bottom_grad_pos = ((b * channels + c) * height + cur_h + dy) * width + cur_w + dx; + scalar_t weight = dmcn_get_gradient_weight(cur_inv_h_data, cur_inv_w_data, cur_h + dy, cur_w + dx, height, width); + atomicAdd(grad_im + cur_bottom_grad_pos, weight * cur_top_grad); + } + } + } + } +} + +template +__global__ void modulated_deformable_col2im_coord_gpu_kernel(const int n, + const scalar_t *data_col, const scalar_t *data_im, + const scalar_t *data_offset, const scalar_t *data_mask, + const int channels, const int height, const int width, + const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int channel_per_deformable_group, + const int batch_size, const int offset_channels, const int deformable_group, + const int height_col, const int width_col, + scalar_t *grad_offset, scalar_t *grad_mask) +{ + CUDA_KERNEL_LOOP(index, n) + { + scalar_t val = 0, mval = 0; + int w = index % width_col; + int h = (index / width_col) % height_col; + int c = (index / width_col / height_col) % offset_channels; + int b = (index / width_col / height_col) / offset_channels; + // compute the start and end of the output + + const int deformable_group_index = c / (2 * kernel_h * kernel_w); + const int col_step = kernel_h * kernel_w; + int cnt = 0; + const scalar_t *data_col_ptr = data_col + deformable_group_index * channel_per_deformable_group * batch_size * width_col * height_col; + const scalar_t *data_im_ptr = data_im + (b * deformable_group + deformable_group_index) * channel_per_deformable_group / kernel_h / kernel_w * height * width; + const scalar_t *data_offset_ptr = data_offset + (b * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col; + const scalar_t *data_mask_ptr = data_mask + (b * deformable_group + deformable_group_index) * kernel_h * kernel_w * height_col * width_col; + + const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w; + + for (int col_c = (offset_c / 2); col_c < channel_per_deformable_group; col_c += col_step) + { + const int col_pos = (((col_c * batch_size + b) * height_col) + h) * width_col + w; + const int bp_dir = offset_c % 2; + + int j = (col_pos / width_col / height_col / batch_size) % kernel_w; + int i = (col_pos / width_col / height_col / batch_size / kernel_w) % kernel_h; + int w_out = col_pos % width_col; + int h_out = (col_pos / width_col) % height_col; + int w_in = w_out * stride_w - pad_w; + int h_in = h_out * stride_h - pad_h; + const 
int data_offset_h_ptr = (((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out); + const int data_offset_w_ptr = (((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out); + const int data_mask_hw_ptr = (((i * kernel_w + j) * height_col + h_out) * width_col + w_out); + const scalar_t offset_h = data_offset_ptr[data_offset_h_ptr]; + const scalar_t offset_w = data_offset_ptr[data_offset_w_ptr]; + const scalar_t mask = data_mask_ptr[data_mask_hw_ptr]; + scalar_t inv_h = h_in + i * dilation_h + offset_h; + scalar_t inv_w = w_in + j * dilation_w + offset_w; + if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width) + { + inv_h = inv_w = -2; + } + else + { + mval += data_col_ptr[col_pos] * dmcn_im2col_bilinear(data_im_ptr + cnt * height * width, width, height, width, inv_h, inv_w); + } + const scalar_t weight = dmcn_get_coordinate_weight( + inv_h, inv_w, + height, width, data_im_ptr + cnt * height * width, width, bp_dir); + val += weight * data_col_ptr[col_pos] * mask; + cnt += 1; + } + // KERNEL_ASSIGN(grad_offset[index], offset_req, val); + grad_offset[index] = val; + if (offset_c % 2 == 0) + // KERNEL_ASSIGN(grad_mask[(((b * deformable_group + deformable_group_index) * kernel_h * kernel_w + offset_c / 2) * height_col + h) * width_col + w], mask_req, mval); + grad_mask[(((b * deformable_group + deformable_group_index) * kernel_h * kernel_w + offset_c / 2) * height_col + h) * width_col + w] = mval; + } +} + +void modulated_deformable_im2col_cuda( + const at::Tensor data_im, const at::Tensor data_offset, const at::Tensor data_mask, + const int batch_size, const int channels, const int height_im, const int width_im, + const int height_col, const int width_col, const int kernel_h, const int kenerl_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int deformable_group, at::Tensor data_col) +{ + // num_axes should be smaller than block size + const int channel_per_deformable_group = channels / deformable_group; + const int num_kernels = channels * batch_size * height_col * width_col; + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + data_im.type(), "modulated_deformable_im2col_gpu", ([&] { + const scalar_t *data_im_ = data_im.data(); + const scalar_t *data_offset_ = data_offset.data(); + const scalar_t *data_mask_ = data_mask.data(); + scalar_t *data_col_ = data_col.data(); + + modulated_deformable_im2col_gpu_kernel<<>>( + num_kernels, data_im_, data_offset_, data_mask_, height_im, width_im, kernel_h, kenerl_w, + pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, channel_per_deformable_group, + batch_size, channels, deformable_group, height_col, width_col, data_col_); + })); + + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + { + printf("error in modulated_deformable_im2col_cuda: %s\n", cudaGetErrorString(err)); + } +} + +void modulated_deformable_col2im_cuda( + const at::Tensor data_col, const at::Tensor data_offset, const at::Tensor data_mask, + const int batch_size, const int channels, const int height_im, const int width_im, + const int height_col, const int width_col, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int deformable_group, at::Tensor grad_im) +{ + + const int channel_per_deformable_group = channels / deformable_group; + const int num_kernels = channels * kernel_h * kernel_w * batch_size * height_col * width_col; + + 
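As with the other wrappers in deform_conv_kernel_cuda.cu, the dispatch that follows is sized so that the `num_kernels` elements computed above are covered by 1024-thread blocks, using the `GET_BLOCKS` helper defined near the top of the file (`CUDA_NUM_THREADS = 1024`, grid capped at `kMaxGridNum = 65535`, with the `CUDA_KERNEL_LOOP` grid-stride loop picking up any elements beyond the cap). A short Python sketch of that sizing rule, for illustration only; `get_blocks` and the example shape are not part of the patch.

```python
CUDA_NUM_THREADS = 1024  # matches the constant defined in these .cu files
K_MAX_GRID_NUM = 65535   # grid cap applied by GET_BLOCKS in deform_conv_kernel_cuda.cu

def get_blocks(n, cap_grid=True):
    """Number of 1024-thread blocks so that n elements get one thread each."""
    blocks = (n + CUDA_NUM_THREADS - 1) // CUDA_NUM_THREADS  # ceil division
    return min(K_MAX_GRID_NUM, blocks) if cap_grid else blocks

# Example: 3x3 modulated col2im, 64 channels, batch of 2, 56x56 output columns:
num_kernels = 64 * 3 * 3 * 2 * 56 * 56
print(num_kernels, get_blocks(num_kernels))  # 3612672 3528
```

The deform_pool_kernel_cuda.cu file further below defines the same helper without the grid cap.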
AT_DISPATCH_FLOATING_TYPES_AND_HALF( + data_col.type(), "modulated_deformable_col2im_gpu", ([&] { + const scalar_t *data_col_ = data_col.data(); + const scalar_t *data_offset_ = data_offset.data(); + const scalar_t *data_mask_ = data_mask.data(); + scalar_t *grad_im_ = grad_im.data(); + + modulated_deformable_col2im_gpu_kernel<<>>( + num_kernels, data_col_, data_offset_, data_mask_, channels, height_im, width_im, + kernel_h, kernel_w, pad_h, pad_h, stride_h, stride_w, + dilation_h, dilation_w, channel_per_deformable_group, + batch_size, deformable_group, height_col, width_col, grad_im_); + })); + + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + { + printf("error in modulated_deformable_col2im_cuda: %s\n", cudaGetErrorString(err)); + } +} + +void modulated_deformable_col2im_coord_cuda( + const at::Tensor data_col, const at::Tensor data_im, const at::Tensor data_offset, const at::Tensor data_mask, + const int batch_size, const int channels, const int height_im, const int width_im, + const int height_col, const int width_col, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int deformable_group, + at::Tensor grad_offset, at::Tensor grad_mask) +{ + const int num_kernels = batch_size * height_col * width_col * 2 * kernel_h * kernel_w * deformable_group; + const int channel_per_deformable_group = channels * kernel_h * kernel_w / deformable_group; + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + data_col.type(), "modulated_deformable_col2im_coord_gpu", ([&] { + const scalar_t *data_col_ = data_col.data(); + const scalar_t *data_im_ = data_im.data(); + const scalar_t *data_offset_ = data_offset.data(); + const scalar_t *data_mask_ = data_mask.data(); + scalar_t *grad_offset_ = grad_offset.data(); + scalar_t *grad_mask_ = grad_mask.data(); + + modulated_deformable_col2im_coord_gpu_kernel<<>>( + num_kernels, data_col_, data_im_, data_offset_, data_mask_, channels, height_im, width_im, + kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, + dilation_h, dilation_w, channel_per_deformable_group, + batch_size, 2 * kernel_h * kernel_w * deformable_group, deformable_group, height_col, width_col, + grad_offset_, grad_mask_); + })); + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + { + printf("error in modulated_deformable_col2im_coord_cuda: %s\n", cudaGetErrorString(err)); + } +} diff --git a/maskrcnn_benchmark/csrc/cuda/deform_pool_cuda.cu b/maskrcnn_benchmark/csrc/cuda/deform_pool_cuda.cu new file mode 100644 index 000000000..71f305af9 --- /dev/null +++ b/maskrcnn_benchmark/csrc/cuda/deform_pool_cuda.cu @@ -0,0 +1,87 @@ +// modify from +// https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/blob/mmdetection/mmdet/ops/dcn/src/modulated_dcn_cuda.c + +// based on +// author: Charles Shang +// https://github.com/torch/cunn/blob/master/lib/THCUNN/generic/SpatialConvolutionMM.cu + +#include +#include + +#include +#include + +#include +#include +#include + + +void DeformablePSROIPoolForward( + const at::Tensor data, const at::Tensor bbox, const at::Tensor trans, + at::Tensor out, at::Tensor top_count, const int batch, const int channels, + const int height, const int width, const int num_bbox, + const int channels_trans, const int no_trans, const float spatial_scale, + const int output_dim, const int group_size, const int pooled_size, + const int part_size, const int sample_per_part, const float trans_std); + +void DeformablePSROIPoolBackwardAcc( + 
const at::Tensor out_grad, const at::Tensor data, const at::Tensor bbox, + const at::Tensor trans, const at::Tensor top_count, at::Tensor in_grad, + at::Tensor trans_grad, const int batch, const int channels, + const int height, const int width, const int num_bbox, + const int channels_trans, const int no_trans, const float spatial_scale, + const int output_dim, const int group_size, const int pooled_size, + const int part_size, const int sample_per_part, const float trans_std); + +void deform_psroi_pooling_cuda_forward( + at::Tensor input, at::Tensor bbox, at::Tensor trans, at::Tensor out, + at::Tensor top_count, const int no_trans, const float spatial_scale, + const int output_dim, const int group_size, const int pooled_size, + const int part_size, const int sample_per_part, const float trans_std) +{ + AT_CHECK(input.is_contiguous(), "input tensor has to be contiguous"); + + const int batch = input.size(0); + const int channels = input.size(1); + const int height = input.size(2); + const int width = input.size(3); + const int channels_trans = no_trans ? 2 : trans.size(1); + + const int num_bbox = bbox.size(0); + if (num_bbox != out.size(0)) + AT_ERROR("Output shape and bbox number wont match: (%d vs %d).", + out.size(0), num_bbox); + + DeformablePSROIPoolForward( + input, bbox, trans, out, top_count, batch, channels, height, width, + num_bbox, channels_trans, no_trans, spatial_scale, output_dim, group_size, + pooled_size, part_size, sample_per_part, trans_std); +} + +void deform_psroi_pooling_cuda_backward( + at::Tensor out_grad, at::Tensor input, at::Tensor bbox, at::Tensor trans, + at::Tensor top_count, at::Tensor input_grad, at::Tensor trans_grad, + const int no_trans, const float spatial_scale, const int output_dim, + const int group_size, const int pooled_size, const int part_size, + const int sample_per_part, const float trans_std) +{ + AT_CHECK(out_grad.is_contiguous(), "out_grad tensor has to be contiguous"); + AT_CHECK(input.is_contiguous(), "input tensor has to be contiguous"); + + const int batch = input.size(0); + const int channels = input.size(1); + const int height = input.size(2); + const int width = input.size(3); + const int channels_trans = no_trans ? 2 : trans.size(1); + + const int num_bbox = bbox.size(0); + if (num_bbox != out_grad.size(0)) + AT_ERROR("Output shape and bbox number wont match: (%d vs %d).", + out_grad.size(0), num_bbox); + + DeformablePSROIPoolBackwardAcc( + out_grad, input, bbox, trans, top_count, input_grad, trans_grad, batch, + channels, height, width, num_bbox, channels_trans, no_trans, + spatial_scale, output_dim, group_size, pooled_size, part_size, + sample_per_part, trans_std); +} diff --git a/maskrcnn_benchmark/csrc/cuda/deform_pool_kernel_cuda.cu b/maskrcnn_benchmark/csrc/cuda/deform_pool_kernel_cuda.cu new file mode 100644 index 000000000..127899ec6 --- /dev/null +++ b/maskrcnn_benchmark/csrc/cuda/deform_pool_kernel_cuda.cu @@ -0,0 +1,365 @@ +/*! 
+ * Copyright (c) 2017 Microsoft + * Licensed under The MIT License [see LICENSE for details] + * \file deformable_psroi_pooling.cu + * \brief + * \author Yi Li, Guodong Zhang, Jifeng Dai +*/ +/***************** Adapted by Charles Shang *********************/ +// modify from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/blob/mmdetection/mmdet/ops/dcn/src/cuda/deform_psroi_pooling_cuda.cu + + +#include +#include +#include +#include +#include + +using namespace at; + +#define CUDA_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; \ + i < (n); \ + i += blockDim.x * gridDim.x) + +const int CUDA_NUM_THREADS = 1024; +inline int GET_BLOCKS(const int N) +{ + return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; +} + +template +__device__ scalar_t bilinear_interp( + const scalar_t *data, + const scalar_t x, + const scalar_t y, + const int width, + const int height) +{ + int x1 = floor(x); + int x2 = ceil(x); + int y1 = floor(y); + int y2 = ceil(y); + scalar_t dist_x = (scalar_t)(x - x1); + scalar_t dist_y = (scalar_t)(y - y1); + scalar_t value11 = data[y1 * width + x1]; + scalar_t value12 = data[y2 * width + x1]; + scalar_t value21 = data[y1 * width + x2]; + scalar_t value22 = data[y2 * width + x2]; + scalar_t value = (1 - dist_x) * (1 - dist_y) * value11 + (1 - dist_x) * dist_y * value12 + dist_x * (1 - dist_y) * value21 + dist_x * dist_y * value22; + return value; +} + +template +__global__ void DeformablePSROIPoolForwardKernel( + const int count, + const scalar_t *bottom_data, + const scalar_t spatial_scale, + const int channels, + const int height, const int width, + const int pooled_height, const int pooled_width, + const scalar_t *bottom_rois, const scalar_t *bottom_trans, + const int no_trans, + const scalar_t trans_std, + const int sample_per_part, + const int output_dim, + const int group_size, + const int part_size, + const int num_classes, + const int channels_each_class, + scalar_t *top_data, + scalar_t *top_count) +{ + CUDA_KERNEL_LOOP(index, count) + { + // The output is in order (n, ctop, ph, pw) + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int ctop = (index / pooled_width / pooled_height) % output_dim; + int n = index / pooled_width / pooled_height / output_dim; + + // [start, end) interval for spatial sampling + const scalar_t *offset_bottom_rois = bottom_rois + n * 5; + int roi_batch_ind = offset_bottom_rois[0]; + scalar_t roi_start_w = (scalar_t)(round(offset_bottom_rois[1])) * spatial_scale - 0.5; + scalar_t roi_start_h = (scalar_t)(round(offset_bottom_rois[2])) * spatial_scale - 0.5; + scalar_t roi_end_w = (scalar_t)(round(offset_bottom_rois[3]) + 1.) * spatial_scale - 0.5; + scalar_t roi_end_h = (scalar_t)(round(offset_bottom_rois[4]) + 1.) * spatial_scale - 0.5; + + // Force too small ROIs to be 1x1 + scalar_t roi_width = max(roi_end_w - roi_start_w, 0.1); //avoid 0 + scalar_t roi_height = max(roi_end_h - roi_start_h, 0.1); + + // Compute w and h at bottom + scalar_t bin_size_h = roi_height / (scalar_t)(pooled_height); + scalar_t bin_size_w = roi_width / (scalar_t)(pooled_width); + + scalar_t sub_bin_size_h = bin_size_h / (scalar_t)(sample_per_part); + scalar_t sub_bin_size_w = bin_size_w / (scalar_t)(sample_per_part); + + int part_h = floor((scalar_t)(ph) / pooled_height * part_size); + int part_w = floor((scalar_t)(pw) / pooled_width * part_size); + int class_id = ctop / channels_each_class; + scalar_t trans_x = no_trans ? 
(scalar_t)(0) : bottom_trans[(((n * num_classes + class_id) * 2) * part_size + part_h) * part_size + part_w] * (scalar_t)trans_std; + scalar_t trans_y = no_trans ? (scalar_t)(0) : bottom_trans[(((n * num_classes + class_id) * 2 + 1) * part_size + part_h) * part_size + part_w] * (scalar_t)trans_std; + + scalar_t wstart = (scalar_t)(pw)*bin_size_w + roi_start_w; + wstart += trans_x * roi_width; + scalar_t hstart = (scalar_t)(ph)*bin_size_h + roi_start_h; + hstart += trans_y * roi_height; + + scalar_t sum = 0; + int count = 0; + int gw = floor((scalar_t)(pw)*group_size / pooled_width); + int gh = floor((scalar_t)(ph)*group_size / pooled_height); + gw = min(max(gw, 0), group_size - 1); + gh = min(max(gh, 0), group_size - 1); + + const scalar_t *offset_bottom_data = bottom_data + (roi_batch_ind * channels) * height * width; + for (int ih = 0; ih < sample_per_part; ih++) + { + for (int iw = 0; iw < sample_per_part; iw++) + { + scalar_t w = wstart + iw * sub_bin_size_w; + scalar_t h = hstart + ih * sub_bin_size_h; + // bilinear interpolation + if (w < -0.5 || w > width - 0.5 || h < -0.5 || h > height - 0.5) + { + continue; + } + w = min(max(w, 0.), width - 1.); + h = min(max(h, 0.), height - 1.); + int c = (ctop * group_size + gh) * group_size + gw; + scalar_t val = bilinear_interp(offset_bottom_data + c * height * width, w, h, width, height); + sum += val; + count++; + } + } + top_data[index] = count == 0 ? (scalar_t)(0) : sum / count; + top_count[index] = count; + } +} + +template +__global__ void DeformablePSROIPoolBackwardAccKernel( + const int count, + const scalar_t *top_diff, + const scalar_t *top_count, + const int num_rois, + const scalar_t spatial_scale, + const int channels, + const int height, const int width, + const int pooled_height, const int pooled_width, + const int output_dim, + scalar_t *bottom_data_diff, scalar_t *bottom_trans_diff, + const scalar_t *bottom_data, + const scalar_t *bottom_rois, + const scalar_t *bottom_trans, + const int no_trans, + const scalar_t trans_std, + const int sample_per_part, + const int group_size, + const int part_size, + const int num_classes, + const int channels_each_class) +{ + CUDA_KERNEL_LOOP(index, count) + { + // The output is in order (n, ctop, ph, pw) + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int ctop = (index / pooled_width / pooled_height) % output_dim; + int n = index / pooled_width / pooled_height / output_dim; + + // [start, end) interval for spatial sampling + const scalar_t *offset_bottom_rois = bottom_rois + n * 5; + int roi_batch_ind = offset_bottom_rois[0]; + scalar_t roi_start_w = (scalar_t)(round(offset_bottom_rois[1])) * spatial_scale - 0.5; + scalar_t roi_start_h = (scalar_t)(round(offset_bottom_rois[2])) * spatial_scale - 0.5; + scalar_t roi_end_w = (scalar_t)(round(offset_bottom_rois[3]) + 1.) * spatial_scale - 0.5; + scalar_t roi_end_h = (scalar_t)(round(offset_bottom_rois[4]) + 1.) 
* spatial_scale - 0.5; + + // Force too small ROIs to be 1x1 + scalar_t roi_width = max(roi_end_w - roi_start_w, 0.1); //avoid 0 + scalar_t roi_height = max(roi_end_h - roi_start_h, 0.1); + + // Compute w and h at bottom + scalar_t bin_size_h = roi_height / (scalar_t)(pooled_height); + scalar_t bin_size_w = roi_width / (scalar_t)(pooled_width); + + scalar_t sub_bin_size_h = bin_size_h / (scalar_t)(sample_per_part); + scalar_t sub_bin_size_w = bin_size_w / (scalar_t)(sample_per_part); + + int part_h = floor((scalar_t)(ph) / pooled_height * part_size); + int part_w = floor((scalar_t)(pw) / pooled_width * part_size); + int class_id = ctop / channels_each_class; + scalar_t trans_x = no_trans ? (scalar_t)(0) : bottom_trans[(((n * num_classes + class_id) * 2) * part_size + part_h) * part_size + part_w] * (scalar_t)trans_std; + scalar_t trans_y = no_trans ? (scalar_t)(0) : bottom_trans[(((n * num_classes + class_id) * 2 + 1) * part_size + part_h) * part_size + part_w] * (scalar_t)trans_std; + + scalar_t wstart = (scalar_t)(pw)*bin_size_w + roi_start_w; + wstart += trans_x * roi_width; + scalar_t hstart = (scalar_t)(ph)*bin_size_h + roi_start_h; + hstart += trans_y * roi_height; + + if (top_count[index] <= 0) + { + continue; + } + scalar_t diff_val = top_diff[index] / top_count[index]; + const scalar_t *offset_bottom_data = bottom_data + roi_batch_ind * channels * height * width; + scalar_t *offset_bottom_data_diff = bottom_data_diff + roi_batch_ind * channels * height * width; + int gw = floor((scalar_t)(pw)*group_size / pooled_width); + int gh = floor((scalar_t)(ph)*group_size / pooled_height); + gw = min(max(gw, 0), group_size - 1); + gh = min(max(gh, 0), group_size - 1); + + for (int ih = 0; ih < sample_per_part; ih++) + { + for (int iw = 0; iw < sample_per_part; iw++) + { + scalar_t w = wstart + iw * sub_bin_size_w; + scalar_t h = hstart + ih * sub_bin_size_h; + // bilinear interpolation + if (w < -0.5 || w > width - 0.5 || h < -0.5 || h > height - 0.5) + { + continue; + } + w = min(max(w, 0.), width - 1.); + h = min(max(h, 0.), height - 1.); + int c = (ctop * group_size + gh) * group_size + gw; + // backward on feature + int x0 = floor(w); + int x1 = ceil(w); + int y0 = floor(h); + int y1 = ceil(h); + scalar_t dist_x = w - x0, dist_y = h - y0; + scalar_t q00 = (1 - dist_x) * (1 - dist_y); + scalar_t q01 = (1 - dist_x) * dist_y; + scalar_t q10 = dist_x * (1 - dist_y); + scalar_t q11 = dist_x * dist_y; + int bottom_index_base = c * height * width; + atomicAdd(offset_bottom_data_diff + bottom_index_base + y0 * width + x0, q00 * diff_val); + atomicAdd(offset_bottom_data_diff + bottom_index_base + y1 * width + x0, q01 * diff_val); + atomicAdd(offset_bottom_data_diff + bottom_index_base + y0 * width + x1, q10 * diff_val); + atomicAdd(offset_bottom_data_diff + bottom_index_base + y1 * width + x1, q11 * diff_val); + + if (no_trans) + { + continue; + } + scalar_t U00 = offset_bottom_data[bottom_index_base + y0 * width + x0]; + scalar_t U01 = offset_bottom_data[bottom_index_base + y1 * width + x0]; + scalar_t U10 = offset_bottom_data[bottom_index_base + y0 * width + x1]; + scalar_t U11 = offset_bottom_data[bottom_index_base + y1 * width + x1]; + scalar_t diff_x = (U11 * dist_y + U10 * (1 - dist_y) - U01 * dist_y - U00 * (1 - dist_y)) * trans_std * diff_val; + diff_x *= roi_width; + scalar_t diff_y = (U11 * dist_x + U01 * (1 - dist_x) - U10 * dist_x - U00 * (1 - dist_x)) * trans_std * diff_val; + diff_y *= roi_height; + + atomicAdd(bottom_trans_diff + (((n * num_classes + class_id) * 2) * part_size + 
part_h) * part_size + part_w, diff_x); + atomicAdd(bottom_trans_diff + (((n * num_classes + class_id) * 2 + 1) * part_size + part_h) * part_size + part_w, diff_y); + } + } + } +} + +void DeformablePSROIPoolForward(const at::Tensor data, + const at::Tensor bbox, + const at::Tensor trans, + at::Tensor out, + at::Tensor top_count, + const int batch, + const int channels, + const int height, + const int width, + const int num_bbox, + const int channels_trans, + const int no_trans, + const float spatial_scale, + const int output_dim, + const int group_size, + const int pooled_size, + const int part_size, + const int sample_per_part, + const float trans_std) +{ + const int pooled_height = pooled_size; + const int pooled_width = pooled_size; + const int count = num_bbox * output_dim * pooled_height * pooled_width; + const int num_classes = no_trans ? 1 : channels_trans / 2; + const int channels_each_class = no_trans ? output_dim : output_dim / num_classes; + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + data.type(), "deformable_psroi_pool_forward", ([&] { + const scalar_t *bottom_data = data.data(); + const scalar_t *bottom_rois = bbox.data(); + const scalar_t *bottom_trans = no_trans ? NULL : trans.data(); + scalar_t *top_data = out.data(); + scalar_t *top_count_data = top_count.data(); + + DeformablePSROIPoolForwardKernel<<>>( + count, bottom_data, (scalar_t)spatial_scale, channels, height, width, pooled_height, pooled_width, + bottom_rois, bottom_trans, no_trans, (scalar_t)trans_std, sample_per_part, output_dim, + group_size, part_size, num_classes, channels_each_class, top_data, top_count_data); + })); + + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + { + printf("error in DeformablePSROIPoolForward: %s\n", cudaGetErrorString(err)); + } +} + +void DeformablePSROIPoolBackwardAcc(const at::Tensor out_grad, + const at::Tensor data, + const at::Tensor bbox, + const at::Tensor trans, + const at::Tensor top_count, + at::Tensor in_grad, + at::Tensor trans_grad, + const int batch, + const int channels, + const int height, + const int width, + const int num_bbox, + const int channels_trans, + const int no_trans, + const float spatial_scale, + const int output_dim, + const int group_size, + const int pooled_size, + const int part_size, + const int sample_per_part, + const float trans_std) +{ + // LOG(INFO) << "DeformablePSROIPoolBackward"; + const int num_rois = num_bbox; + const int pooled_height = pooled_size; + const int pooled_width = pooled_size; + const int count = num_bbox * output_dim * pooled_height * pooled_width; + const int num_classes = no_trans ? 1 : channels_trans / 2; + const int channels_each_class = no_trans ? output_dim : output_dim / num_classes; + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + out_grad.type(), "deformable_psroi_pool_backward_acc", ([&] { + const scalar_t *top_diff = out_grad.data(); + const scalar_t *bottom_data = data.data(); + const scalar_t *bottom_rois = bbox.data(); + const scalar_t *bottom_trans = no_trans ? NULL : trans.data(); + scalar_t *bottom_data_diff = in_grad.data(); + scalar_t *bottom_trans_diff = no_trans ? 
NULL : trans_grad.data<scalar_t>(); + const scalar_t *top_count_data = top_count.data<scalar_t>(); + + DeformablePSROIPoolBackwardAccKernel<<<GET_BLOCKS(count), CUDA_NUM_THREADS>>>( + count, top_diff, top_count_data, num_rois, (scalar_t)spatial_scale, channels, height, width, + pooled_height, pooled_width, output_dim, bottom_data_diff, bottom_trans_diff, + bottom_data, bottom_rois, bottom_trans, no_trans, (scalar_t)trans_std, sample_per_part, + group_size, part_size, num_classes, channels_each_class); + })); + + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + { + printf("error in DeformablePSROIPoolBackwardAcc: %s\n", cudaGetErrorString(err)); + } +} \ No newline at end of file diff --git a/maskrcnn_benchmark/csrc/cuda/vision.h b/maskrcnn_benchmark/csrc/cuda/vision.h index 6d9f8871f..32d3c6956 100644 --- a/maskrcnn_benchmark/csrc/cuda/vision.h +++ b/maskrcnn_benchmark/csrc/cuda/vision.h @@ -58,6 +58,59 @@ at::Tensor ROIPool_backward_cuda(const at::Tensor& grad, at::Tensor nms_cuda(const at::Tensor boxes, float nms_overlap_thresh); +int deform_conv_forward_cuda(at::Tensor input, at::Tensor weight, + at::Tensor offset, at::Tensor output, + at::Tensor columns, at::Tensor ones, int kW, + int kH, int dW, int dH, int padW, int padH, + int dilationW, int dilationH, int group, + int deformable_group, int im2col_step); + +int deform_conv_backward_input_cuda(at::Tensor input, at::Tensor offset, + at::Tensor gradOutput, at::Tensor gradInput, + at::Tensor gradOffset, at::Tensor weight, + at::Tensor columns, int kW, int kH, int dW, + int dH, int padW, int padH, int dilationW, + int dilationH, int group, + int deformable_group, int im2col_step); + +int deform_conv_backward_parameters_cuda( + at::Tensor input, at::Tensor offset, at::Tensor gradOutput, + at::Tensor gradWeight, // at::Tensor gradBias, + at::Tensor columns, at::Tensor ones, int kW, int kH, int dW, int dH, + int padW, int padH, int dilationW, int dilationH, int group, + int deformable_group, float scale, int im2col_step); + +void modulated_deform_conv_cuda_forward( + at::Tensor input, at::Tensor weight, at::Tensor bias, at::Tensor ones, + at::Tensor offset, at::Tensor mask, at::Tensor output, at::Tensor columns, + int kernel_h, int kernel_w, const int stride_h, const int stride_w, + const int pad_h, const int pad_w, const int dilation_h, + const int dilation_w, const int group, const int deformable_group, + const bool with_bias); + +void modulated_deform_conv_cuda_backward( + at::Tensor input, at::Tensor weight, at::Tensor bias, at::Tensor ones, + at::Tensor offset, at::Tensor mask, at::Tensor columns, + at::Tensor grad_input, at::Tensor grad_weight, at::Tensor grad_bias, + at::Tensor grad_offset, at::Tensor grad_mask, at::Tensor grad_output, + int kernel_h, int kernel_w, int stride_h, int stride_w, int pad_h, + int pad_w, int dilation_h, int dilation_w, int group, int deformable_group, + const bool with_bias); + +void deform_psroi_pooling_cuda_forward( + at::Tensor input, at::Tensor bbox, at::Tensor trans, at::Tensor out, + at::Tensor top_count, const int no_trans, const float spatial_scale, + const int output_dim, const int group_size, const int pooled_size, + const int part_size, const int sample_per_part, const float trans_std); + +void deform_psroi_pooling_cuda_backward( + at::Tensor out_grad, at::Tensor input, at::Tensor bbox, at::Tensor trans, + at::Tensor top_count, at::Tensor input_grad, at::Tensor trans_grad, + const int no_trans, const float spatial_scale, const int output_dim, + const int group_size, const int pooled_size, const int part_size, + const int sample_per_part,
const float trans_std); + + at::Tensor compute_flow_cuda(const at::Tensor& boxes, const int height, const int width); diff --git a/maskrcnn_benchmark/csrc/deform_conv.h b/maskrcnn_benchmark/csrc/deform_conv.h new file mode 100644 index 000000000..a5930e390 --- /dev/null +++ b/maskrcnn_benchmark/csrc/deform_conv.h @@ -0,0 +1,191 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +#pragma once +#include "cpu/vision.h" + +#ifdef WITH_CUDA +#include "cuda/vision.h" +#endif + + +// Interface for Python +int deform_conv_forward( + at::Tensor input, + at::Tensor weight, + at::Tensor offset, + at::Tensor output, + at::Tensor columns, + at::Tensor ones, + int kW, + int kH, + int dW, + int dH, + int padW, + int padH, + int dilationW, + int dilationH, + int group, + int deformable_group, + int im2col_step) +{ + if (input.type().is_cuda()) { +#ifdef WITH_CUDA + return deform_conv_forward_cuda( + input, weight, offset, output, columns, ones, + kW, kH, dW, dH, padW, padH, dilationW, dilationH, + group, deformable_group, im2col_step + ); +#else + AT_ERROR("Not compiled with GPU support"); +#endif + } + AT_ERROR("Not implemented on the CPU"); +} + + +int deform_conv_backward_input( + at::Tensor input, + at::Tensor offset, + at::Tensor gradOutput, + at::Tensor gradInput, + at::Tensor gradOffset, + at::Tensor weight, + at::Tensor columns, + int kW, + int kH, + int dW, + int dH, + int padW, + int padH, + int dilationW, + int dilationH, + int group, + int deformable_group, + int im2col_step) +{ + if (input.type().is_cuda()) { +#ifdef WITH_CUDA + return deform_conv_backward_input_cuda( + input, offset, gradOutput, gradInput, gradOffset, weight, columns, + kW, kH, dW, dH, padW, padH, dilationW, dilationH, + group, deformable_group, im2col_step + ); +#else + AT_ERROR("Not compiled with GPU support"); +#endif + } + AT_ERROR("Not implemented on the CPU"); +} + + +int deform_conv_backward_parameters( + at::Tensor input, + at::Tensor offset, + at::Tensor gradOutput, + at::Tensor gradWeight, // at::Tensor gradBias, + at::Tensor columns, + at::Tensor ones, + int kW, + int kH, + int dW, + int dH, + int padW, + int padH, + int dilationW, + int dilationH, + int group, + int deformable_group, + float scale, + int im2col_step) +{ + if (input.type().is_cuda()) { +#ifdef WITH_CUDA + return deform_conv_backward_parameters_cuda( + input, offset, gradOutput, gradWeight, columns, ones, + kW, kH, dW, dH, padW, padH, dilationW, dilationH, + group, deformable_group, scale, im2col_step + ); +#else + AT_ERROR("Not compiled with GPU support"); +#endif + } + AT_ERROR("Not implemented on the CPU"); +} + + +void modulated_deform_conv_forward( + at::Tensor input, + at::Tensor weight, + at::Tensor bias, + at::Tensor ones, + at::Tensor offset, + at::Tensor mask, + at::Tensor output, + at::Tensor columns, + int kernel_h, + int kernel_w, + const int stride_h, + const int stride_w, + const int pad_h, + const int pad_w, + const int dilation_h, + const int dilation_w, + const int group, + const int deformable_group, + const bool with_bias) +{ + if (input.type().is_cuda()) { +#ifdef WITH_CUDA + return modulated_deform_conv_cuda_forward( + input, weight, bias, ones, offset, mask, output, columns, + kernel_h, kernel_w, stride_h, stride_w, + pad_h, pad_w, dilation_h, dilation_w, + group, deformable_group, with_bias + ); +#else + AT_ERROR("Not compiled with GPU support"); +#endif + } + AT_ERROR("Not implemented on the CPU"); +} + + +void modulated_deform_conv_backward( + at::Tensor input, + at::Tensor weight, + at::Tensor 
bias, + at::Tensor ones, + at::Tensor offset, + at::Tensor mask, + at::Tensor columns, + at::Tensor grad_input, + at::Tensor grad_weight, + at::Tensor grad_bias, + at::Tensor grad_offset, + at::Tensor grad_mask, + at::Tensor grad_output, + int kernel_h, + int kernel_w, + int stride_h, + int stride_w, + int pad_h, + int pad_w, + int dilation_h, + int dilation_w, + int group, + int deformable_group, + const bool with_bias) +{ + if (input.type().is_cuda()) { +#ifdef WITH_CUDA + return modulated_deform_conv_cuda_backward( + input, weight, bias, ones, offset, mask, columns, + grad_input, grad_weight, grad_bias, grad_offset, grad_mask, grad_output, + kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w, + group, deformable_group, with_bias + ); +#else + AT_ERROR("Not compiled with GPU support"); +#endif + } + AT_ERROR("Not implemented on the CPU"); +} \ No newline at end of file diff --git a/maskrcnn_benchmark/csrc/deform_pool.h b/maskrcnn_benchmark/csrc/deform_pool.h new file mode 100644 index 000000000..234223809 --- /dev/null +++ b/maskrcnn_benchmark/csrc/deform_pool.h @@ -0,0 +1,70 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +#pragma once +#include "cpu/vision.h" + +#ifdef WITH_CUDA +#include "cuda/vision.h" +#endif + + +// Interface for Python +void deform_psroi_pooling_forward( + at::Tensor input, + at::Tensor bbox, + at::Tensor trans, + at::Tensor out, + at::Tensor top_count, + const int no_trans, + const float spatial_scale, + const int output_dim, + const int group_size, + const int pooled_size, + const int part_size, + const int sample_per_part, + const float trans_std) +{ + if (input.type().is_cuda()) { +#ifdef WITH_CUDA + return deform_psroi_pooling_cuda_forward( + input, bbox, trans, out, top_count, + no_trans, spatial_scale, output_dim, group_size, + pooled_size, part_size, sample_per_part, trans_std + ); +#else + AT_ERROR("Not compiled with GPU support"); +#endif + } + AT_ERROR("Not implemented on the CPU"); +} + + +void deform_psroi_pooling_backward( + at::Tensor out_grad, + at::Tensor input, + at::Tensor bbox, + at::Tensor trans, + at::Tensor top_count, + at::Tensor input_grad, + at::Tensor trans_grad, + const int no_trans, + const float spatial_scale, + const int output_dim, + const int group_size, + const int pooled_size, + const int part_size, + const int sample_per_part, + const float trans_std) +{ + if (input.type().is_cuda()) { +#ifdef WITH_CUDA + return deform_psroi_pooling_cuda_backward( + out_grad, input, bbox, trans, top_count, input_grad, trans_grad, + no_trans, spatial_scale, output_dim, group_size, pooled_size, + part_size, sample_per_part, trans_std + ); +#else + AT_ERROR("Not compiled with GPU support"); +#endif + } + AT_ERROR("Not implemented on the CPU"); +} diff --git a/maskrcnn_benchmark/csrc/vision.cpp b/maskrcnn_benchmark/csrc/vision.cpp index 8234f43b1..30971995d 100644 --- a/maskrcnn_benchmark/csrc/vision.cpp +++ b/maskrcnn_benchmark/csrc/vision.cpp @@ -3,6 +3,8 @@ #include "ROIAlign.h" #include "ROIPool.h" #include "SigmoidFocalLoss.h" +#include "deform_conv.h" +#include "deform_pool.h" PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { m.def("nms", &nms, "non-maximum suppression"); @@ -12,4 +14,12 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { m.def("roi_pool_backward", &ROIPool_backward, "ROIPool_backward"); m.def("sigmoid_focalloss_forward", &SigmoidFocalLoss_forward, "SigmoidFocalLoss_forward"); m.def("sigmoid_focalloss_backward", &SigmoidFocalLoss_backward, "SigmoidFocalLoss_backward"); -} + // dcn-v2 + 
m.def("deform_conv_forward", &deform_conv_forward, "deform_conv_forward"); + m.def("deform_conv_backward_input", &deform_conv_backward_input, "deform_conv_backward_input"); + m.def("deform_conv_backward_parameters", &deform_conv_backward_parameters, "deform_conv_backward_parameters"); + m.def("modulated_deform_conv_forward", &modulated_deform_conv_forward, "modulated_deform_conv_forward"); + m.def("modulated_deform_conv_backward", &modulated_deform_conv_backward, "modulated_deform_conv_backward"); + m.def("deform_psroi_pooling_forward", &deform_psroi_pooling_forward, "deform_psroi_pooling_forward"); + m.def("deform_psroi_pooling_backward", &deform_psroi_pooling_backward, "deform_psroi_pooling_backward"); +} \ No newline at end of file diff --git a/maskrcnn_benchmark/data/build.py b/maskrcnn_benchmark/data/build.py index d2895fd7e..b0ce3c348 100644 --- a/maskrcnn_benchmark/data/build.py +++ b/maskrcnn_benchmark/data/build.py @@ -10,7 +10,7 @@ from . import datasets as D from . import samplers -from .collate_batch import BatchCollator +from .collate_batch import BatchCollator, BBoxAugCollator from .transforms import build_transforms @@ -110,8 +110,8 @@ def make_data_loader(cfg, is_train=True, is_distributed=False, start_iter=0): images_per_batch = cfg.SOLVER.IMS_PER_BATCH assert ( images_per_batch % num_gpus == 0 - ), "SOLVER.IMS_PER_BATCH ({}) must be divisible by the number " - "of GPUs ({}) used.".format(images_per_batch, num_gpus) + ), "SOLVER.IMS_PER_BATCH ({}) must be divisible by the number of GPUs ({}) used.".format( + images_per_batch, num_gpus) images_per_gpu = images_per_batch // num_gpus shuffle = True num_iters = cfg.SOLVER.MAX_ITER @@ -119,8 +119,8 @@ def make_data_loader(cfg, is_train=True, is_distributed=False, start_iter=0): images_per_batch = cfg.TEST.IMS_PER_BATCH assert ( images_per_batch % num_gpus == 0 - ), "TEST.IMS_PER_BATCH ({}) must be divisible by the number " - "of GPUs ({}) used.".format(images_per_batch, num_gpus) + ), "TEST.IMS_PER_BATCH ({}) must be divisible by the number of GPUs ({}) used.".format( + images_per_batch, num_gpus) images_per_gpu = images_per_batch // num_gpus shuffle = False if not is_distributed else True num_iters = None @@ -150,7 +150,8 @@ def make_data_loader(cfg, is_train=True, is_distributed=False, start_iter=0): DatasetCatalog = paths_catalog.DatasetCatalog dataset_list = cfg.DATASETS.TRAIN if is_train else cfg.DATASETS.TEST - transforms = build_transforms(cfg, is_train) + # If bbox aug is enabled in testing, simply set transforms to None and we will apply transforms later + transforms = None if not is_train and cfg.TEST.BBOX_AUG.ENABLED else build_transforms(cfg, is_train) datasets = build_dataset(dataset_list, transforms, DatasetCatalog, is_train) data_loaders = [] @@ -159,7 +160,8 @@ def make_data_loader(cfg, is_train=True, is_distributed=False, start_iter=0): batch_sampler = make_batch_data_sampler( dataset, sampler, aspect_grouping, images_per_gpu, num_iters, start_iter ) - collator = BatchCollator(cfg.DATALOADER.SIZE_DIVISIBILITY) + collator = BBoxAugCollator() if not is_train and cfg.TEST.BBOX_AUG.ENABLED else \ + BatchCollator(cfg.DATALOADER.SIZE_DIVISIBILITY) num_workers = cfg.DATALOADER.NUM_WORKERS data_loader = torch.utils.data.DataLoader( dataset, diff --git a/maskrcnn_benchmark/data/collate_batch.py b/maskrcnn_benchmark/data/collate_batch.py index a7f034167..56571f18c 100644 --- a/maskrcnn_benchmark/data/collate_batch.py +++ b/maskrcnn_benchmark/data/collate_batch.py @@ -18,3 +18,15 @@ def __call__(self, batch): targets = 
transposed_batch[1] img_ids = transposed_batch[2] return images, targets, img_ids + + +class BBoxAugCollator(object): + """ + From a list of samples from the dataset, + returns the images and targets. + Images should be converted to batched images in `im_detect_bbox_aug` + """ + + def __call__(self, batch): + return list(zip(*batch)) + diff --git a/maskrcnn_benchmark/data/datasets/coco.py b/maskrcnn_benchmark/data/datasets/coco.py index f0c8c25b4..cd9fc835e 100644 --- a/maskrcnn_benchmark/data/datasets/coco.py +++ b/maskrcnn_benchmark/data/datasets/coco.py @@ -61,7 +61,7 @@ def __init__( v: k for k, v in self.json_category_id_to_contiguous_id.items() } self.id_to_img_map = {k: v for k, v in enumerate(self.ids)} - self.transforms = transforms + self._transforms = transforms def __getitem__(self, idx): img, anno = super(COCODataset, self).__getitem__(idx) @@ -79,9 +79,10 @@ def __getitem__(self, idx): classes = torch.tensor(classes) target.add_field("labels", classes) - masks = [obj["segmentation"] for obj in anno] - masks = SegmentationMask(masks, img.size) - target.add_field("masks", masks) + if anno and "segmentation" in anno[0]: + masks = [obj["segmentation"] for obj in anno] + masks = SegmentationMask(masks, img.size, mode='poly') + target.add_field("masks", masks) if anno and "keypoints" in anno[0]: keypoints = [obj["keypoints"] for obj in anno] @@ -90,8 +91,8 @@ def __getitem__(self, idx): target = target.clip_to_image(remove_empty=True) - if self.transforms is not None: - img, target = self.transforms(img, target) + if self._transforms is not None: + img, target = self._transforms(img, target) return img, target, idx diff --git a/maskrcnn_benchmark/data/datasets/evaluation/voc/voc_eval.py b/maskrcnn_benchmark/data/datasets/evaluation/voc/voc_eval.py index f8b0c1084..693710902 100644 --- a/maskrcnn_benchmark/data/datasets/evaluation/voc/voc_eval.py +++ b/maskrcnn_benchmark/data/datasets/evaluation/voc/voc_eval.py @@ -16,8 +16,6 @@ def do_voc_evaluation(dataset, predictions, output_folder, logger): gt_boxlists = [] for image_id, prediction in enumerate(predictions): img_info = dataset.get_img_info(image_id) - if len(prediction) == 0: - continue image_width = img_info["width"] image_height = img_info["height"] prediction = prediction.resize((image_width, image_height)) diff --git a/maskrcnn_benchmark/data/transforms/build.py b/maskrcnn_benchmark/data/transforms/build.py index 8645d4df4..88aa975b6 100644 --- a/maskrcnn_benchmark/data/transforms/build.py +++ b/maskrcnn_benchmark/data/transforms/build.py @@ -7,18 +7,33 @@ def build_transforms(cfg, is_train=True): min_size = cfg.INPUT.MIN_SIZE_TRAIN max_size = cfg.INPUT.MAX_SIZE_TRAIN flip_prob = 0.5 # cfg.INPUT.FLIP_PROB_TRAIN + brightness = cfg.INPUT.BRIGHTNESS + contrast = cfg.INPUT.CONTRAST + saturation = cfg.INPUT.SATURATION + hue = cfg.INPUT.HUE else: min_size = cfg.INPUT.MIN_SIZE_TEST max_size = cfg.INPUT.MAX_SIZE_TEST flip_prob = 0 + brightness = 0.0 + contrast = 0.0 + saturation = 0.0 + hue = 0.0 to_bgr255 = cfg.INPUT.TO_BGR255 normalize_transform = T.Normalize( mean=cfg.INPUT.PIXEL_MEAN, std=cfg.INPUT.PIXEL_STD, to_bgr255=to_bgr255 ) + color_jitter = T.ColorJitter( + brightness=brightness, + contrast=contrast, + saturation=saturation, + hue=hue, + ) transform = T.Compose( [ + color_jitter, T.Resize(min_size, max_size), T.RandomHorizontalFlip(flip_prob), T.ToTensor(), diff --git a/maskrcnn_benchmark/data/transforms/transforms.py b/maskrcnn_benchmark/data/transforms/transforms.py index 7e3ebbd6c..fa1d93934 100644 --- 
a/maskrcnn_benchmark/data/transforms/transforms.py +++ b/maskrcnn_benchmark/data/transforms/transforms.py @@ -54,9 +54,11 @@ def get_size(self, image_size): return (oh, ow) - def __call__(self, image, target): + def __call__(self, image, target=None): size = self.get_size(image.size) image = F.resize(image, size) + if target is None: + return image target = target.resize(image.size) return image, target @@ -72,6 +74,24 @@ def __call__(self, image, target): return image, target +class ColorJitter(object): + def __init__(self, + brightness=None, + contrast=None, + saturation=None, + hue=None, + ): + self.color_jitter = torchvision.transforms.ColorJitter( + brightness=brightness, + contrast=contrast, + saturation=saturation, + hue=hue,) + + def __call__(self, image, target): + image = self.color_jitter(image) + return image, target + + class ToTensor(object): def __call__(self, image, target): return F.to_tensor(image), target @@ -83,8 +103,10 @@ def __init__(self, mean, std, to_bgr255=True): self.std = std self.to_bgr255 = to_bgr255 - def __call__(self, image, target): + def __call__(self, image, target=None): if self.to_bgr255: image = image[[2, 1, 0]] * 255 image = F.normalize(image, mean=self.mean, std=self.std) + if target is None: + return image return image, target diff --git a/maskrcnn_benchmark/engine/bbox_aug.py b/maskrcnn_benchmark/engine/bbox_aug.py new file mode 100644 index 000000000..444416538 --- /dev/null +++ b/maskrcnn_benchmark/engine/bbox_aug.py @@ -0,0 +1,118 @@ +import torch +import torchvision.transforms as TT + +from maskrcnn_benchmark.config import cfg +from maskrcnn_benchmark.data import transforms as T +from maskrcnn_benchmark.structures.image_list import to_image_list +from maskrcnn_benchmark.structures.bounding_box import BoxList +from maskrcnn_benchmark.modeling.roi_heads.box_head.inference import make_roi_box_post_processor + + +def im_detect_bbox_aug(model, images, device): + # Collect detections computed under different transformations + boxlists_ts = [] + for _ in range(len(images)): + boxlists_ts.append([]) + + def add_preds_t(boxlists_t): + for i, boxlist_t in enumerate(boxlists_t): + if len(boxlists_ts[i]) == 0: + # The first one is identity transform, no need to resize the boxlist + boxlists_ts[i].append(boxlist_t) + else: + # Resize the boxlist as the first one + boxlists_ts[i].append(boxlist_t.resize(boxlists_ts[i][0].size)) + + # Compute detections for the original image (identity transform) + boxlists_i = im_detect_bbox( + model, images, cfg.INPUT.MIN_SIZE_TEST, cfg.INPUT.MAX_SIZE_TEST, device + ) + add_preds_t(boxlists_i) + + # Perform detection on the horizontally flipped image + if cfg.TEST.BBOX_AUG.H_FLIP: + boxlists_hf = im_detect_bbox_hflip( + model, images, cfg.INPUT.MIN_SIZE_TEST, cfg.INPUT.MAX_SIZE_TEST, device + ) + add_preds_t(boxlists_hf) + + # Compute detections at different scales + for scale in cfg.TEST.BBOX_AUG.SCALES: + max_size = cfg.TEST.BBOX_AUG.MAX_SIZE + boxlists_scl = im_detect_bbox_scale( + model, images, scale, max_size, device + ) + add_preds_t(boxlists_scl) + + if cfg.TEST.BBOX_AUG.SCALE_H_FLIP: + boxlists_scl_hf = im_detect_bbox_scale( + model, images, scale, max_size, device, hflip=True + ) + add_preds_t(boxlists_scl_hf) + + # Merge boxlists detected by different bbox aug params + boxlists = [] + for i, boxlist_ts in enumerate(boxlists_ts): + bbox = torch.cat([boxlist_t.bbox for boxlist_t in boxlist_ts]) + scores = torch.cat([boxlist_t.get_field('scores') for boxlist_t in boxlist_ts]) + boxlist = BoxList(bbox, 
boxlist_ts[0].size, boxlist_ts[0].mode) + boxlist.add_field('scores', scores) + boxlists.append(boxlist) + + # Apply NMS and limit the final detections + results = [] + post_processor = make_roi_box_post_processor(cfg) + for boxlist in boxlists: + results.append(post_processor.filter_results(boxlist, cfg.MODEL.ROI_BOX_HEAD.NUM_CLASSES)) + + return results + + +def im_detect_bbox(model, images, target_scale, target_max_size, device): + """ + Performs bbox detection on the original image. + """ + transform = TT.Compose([ + T.Resize(target_scale, target_max_size), + TT.ToTensor(), + T.Normalize( + mean=cfg.INPUT.PIXEL_MEAN, std=cfg.INPUT.PIXEL_STD, to_bgr255=cfg.INPUT.TO_BGR255 + ) + ]) + images = [transform(image) for image in images] + images = to_image_list(images, cfg.DATALOADER.SIZE_DIVISIBILITY) + return model(images.to(device)) + + +def im_detect_bbox_hflip(model, images, target_scale, target_max_size, device): + """ + Performs bbox detection on the horizontally flipped image. + Function signature is the same as for im_detect_bbox. + """ + transform = TT.Compose([ + T.Resize(target_scale, target_max_size), + TT.RandomHorizontalFlip(1.0), + TT.ToTensor(), + T.Normalize( + mean=cfg.INPUT.PIXEL_MEAN, std=cfg.INPUT.PIXEL_STD, to_bgr255=cfg.INPUT.TO_BGR255 + ) + ]) + images = [transform(image) for image in images] + images = to_image_list(images, cfg.DATALOADER.SIZE_DIVISIBILITY) + boxlists = model(images.to(device)) + + # Invert the detections computed on the flipped image + boxlists_inv = [boxlist.transpose(0) for boxlist in boxlists] + return boxlists_inv + + +def im_detect_bbox_scale(model, images, target_scale, target_max_size, device, hflip=False): + """ + Computes bbox detections at the given scale. + Returns predictions in the scaled image space. 
+ """ + if hflip: + boxlists_scl = im_detect_bbox_hflip(model, images, target_scale, target_max_size, device) + else: + boxlists_scl = im_detect_bbox(model, images, target_scale, target_max_size, device) + return boxlists_scl diff --git a/maskrcnn_benchmark/engine/inference.py b/maskrcnn_benchmark/engine/inference.py index e125cb877..82d0abb6d 100644 --- a/maskrcnn_benchmark/engine/inference.py +++ b/maskrcnn_benchmark/engine/inference.py @@ -6,11 +6,13 @@ import torch from tqdm import tqdm +from maskrcnn_benchmark.config import cfg from maskrcnn_benchmark.data.datasets.evaluation import evaluate from ..utils.comm import is_main_process, get_world_size from ..utils.comm import all_gather from ..utils.comm import synchronize from ..utils.timer import Timer, get_time_str +from .bbox_aug import im_detect_bbox_aug def compute_on_dataset(model, data_loader, device, timer=None): @@ -19,13 +21,16 @@ def compute_on_dataset(model, data_loader, device, timer=None): cpu_device = torch.device("cpu") for _, batch in enumerate(tqdm(data_loader)): images, targets, image_ids = batch - images = images.to(device) with torch.no_grad(): if timer: timer.tic() - output = model(images) + if cfg.TEST.BBOX_AUG.ENABLED: + output = im_detect_bbox_aug(model, images, device) + else: + output = model(images.to(device)) if timer: - torch.cuda.synchronize() + if not cfg.MODEL.DEVICE == 'cpu': + torch.cuda.synchronize() timer.toc() output = [o.to(cpu_device) for o in output] results_dict.update( diff --git a/maskrcnn_benchmark/engine/trainer.py b/maskrcnn_benchmark/engine/trainer.py index 38a9e524b..560b63e1c 100644 --- a/maskrcnn_benchmark/engine/trainer.py +++ b/maskrcnn_benchmark/engine/trainer.py @@ -9,6 +9,7 @@ from maskrcnn_benchmark.utils.comm import get_world_size from maskrcnn_benchmark.utils.metric_logger import MetricLogger +from apex import amp def reduce_loss_dict(loss_dict): """ @@ -54,6 +55,10 @@ def do_train( start_training_time = time.time() end = time.time() for iteration, (images, targets, _) in enumerate(data_loader, start_iter): + + if any(len(target) < 1 for target in targets): + logger.error(f"Iteration={iteration + 1} || Image Ids used for training {_} || targets Length={[len(target) for target in targets]}" ) + continue data_time = time.time() - end iteration = iteration + 1 arguments["iteration"] = iteration @@ -73,7 +78,10 @@ def do_train( meters.update(loss=losses_reduced, **loss_dict_reduced) optimizer.zero_grad() - losses.backward() + # Note: If mixed precision is not used, this ends up doing nothing + # Otherwise apply loss scaling for mixed-precision recipe + with amp.scale_loss(losses, optimizer) as scaled_losses: + scaled_losses.backward() optimizer.step() batch_time = time.time() - end diff --git a/maskrcnn_benchmark/layers/__init__.py b/maskrcnn_benchmark/layers/__init__.py index bab50abae..0e1ccc513 100644 --- a/maskrcnn_benchmark/layers/__init__.py +++ b/maskrcnn_benchmark/layers/__init__.py @@ -3,6 +3,7 @@ from .batch_norm import FrozenBatchNorm2d from .misc import Conv2d +from .misc import DFConv2d from .misc import ConvTranspose2d from .misc import BatchNorm2d from .misc import interpolate @@ -13,9 +14,34 @@ from .roi_pool import roi_pool from .smooth_l1_loss import smooth_l1_loss from .sigmoid_focal_loss import SigmoidFocalLoss +from .dcn.deform_conv_func import deform_conv, modulated_deform_conv +from .dcn.deform_conv_module import DeformConv, ModulatedDeformConv, ModulatedDeformConvPack +from .dcn.deform_pool_func import deform_roi_pooling +from .dcn.deform_pool_module import 
DeformRoIPooling, DeformRoIPoolingPack, ModulatedDeformRoIPoolingPack -__all__ = ["nms", "roi_align", "ROIAlign", "roi_pool", "ROIPool", - "smooth_l1_loss", "Conv2d", "ConvTranspose2d", "interpolate", - "BatchNorm2d", "FrozenBatchNorm2d", "SigmoidFocalLoss" - ] + +__all__ = [ + "nms", + "roi_align", + "ROIAlign", + "roi_pool", + "ROIPool", + "smooth_l1_loss", + "Conv2d", + "DFConv2d", + "ConvTranspose2d", + "interpolate", + "BatchNorm2d", + "FrozenBatchNorm2d", + "SigmoidFocalLoss", + 'deform_conv', + 'modulated_deform_conv', + 'DeformConv', + 'ModulatedDeformConv', + 'ModulatedDeformConvPack', + 'deform_roi_pooling', + 'DeformRoIPooling', + 'DeformRoIPoolingPack', + 'ModulatedDeformRoIPoolingPack', +] diff --git a/maskrcnn_benchmark/layers/batch_norm.py b/maskrcnn_benchmark/layers/batch_norm.py index 903607ac3..3762e49e8 100644 --- a/maskrcnn_benchmark/layers/batch_norm.py +++ b/maskrcnn_benchmark/layers/batch_norm.py @@ -17,6 +17,13 @@ def __init__(self, n): self.register_buffer("running_var", torch.ones(n)) def forward(self, x): + # Cast all fixed parameters to half() if necessary + if x.dtype == torch.float16: + self.weight = self.weight.half() + self.bias = self.bias.half() + self.running_mean = self.running_mean.half() + self.running_var = self.running_var.half() + scale = self.weight * self.running_var.rsqrt() bias = self.bias - self.running_mean * scale scale = scale.reshape(1, -1, 1, 1) diff --git a/maskrcnn_benchmark/layers/dcn/__init__.py b/maskrcnn_benchmark/layers/dcn/__init__.py new file mode 100644 index 000000000..22fe18ff3 --- /dev/null +++ b/maskrcnn_benchmark/layers/dcn/__init__.py @@ -0,0 +1,3 @@ +# +# Copied From [mmdetection](https://github.com/open-mmlab/mmdetection/tree/master/mmdet/ops/dcn) +# \ No newline at end of file diff --git a/maskrcnn_benchmark/layers/dcn/deform_conv_func.py b/maskrcnn_benchmark/layers/dcn/deform_conv_func.py new file mode 100644 index 000000000..a276a05fe --- /dev/null +++ b/maskrcnn_benchmark/layers/dcn/deform_conv_func.py @@ -0,0 +1,262 @@ +import torch +from torch.autograd import Function +from torch.autograd.function import once_differentiable +from torch.nn.modules.utils import _pair + +from maskrcnn_benchmark import _C + + +class DeformConvFunction(Function): + + @staticmethod + def forward( + ctx, + input, + offset, + weight, + stride=1, + padding=0, + dilation=1, + groups=1, + deformable_groups=1, + im2col_step=64 + ): + if input is not None and input.dim() != 4: + raise ValueError( + "Expected 4D tensor as input, got {}D tensor instead.".format( + input.dim())) + ctx.stride = _pair(stride) + ctx.padding = _pair(padding) + ctx.dilation = _pair(dilation) + ctx.groups = groups + ctx.deformable_groups = deformable_groups + ctx.im2col_step = im2col_step + + ctx.save_for_backward(input, offset, weight) + + output = input.new_empty( + DeformConvFunction._output_size(input, weight, ctx.padding, + ctx.dilation, ctx.stride)) + + ctx.bufs_ = [input.new_empty(0), input.new_empty(0)] # columns, ones + + if not input.is_cuda: + raise NotImplementedError + else: + cur_im2col_step = min(ctx.im2col_step, input.shape[0]) + assert (input.shape[0] % + cur_im2col_step) == 0, 'im2col step must divide batchsize' + _C.deform_conv_forward( + input, + weight, + offset, + output, + ctx.bufs_[0], + ctx.bufs_[1], + weight.size(3), + weight.size(2), + ctx.stride[1], + ctx.stride[0], + ctx.padding[1], + ctx.padding[0], + ctx.dilation[1], + ctx.dilation[0], + ctx.groups, + ctx.deformable_groups, + cur_im2col_step + ) + return output + + @staticmethod + 
@once_differentiable + def backward(ctx, grad_output): + input, offset, weight = ctx.saved_tensors + + grad_input = grad_offset = grad_weight = None + + if not grad_output.is_cuda: + raise NotImplementedError + else: + cur_im2col_step = min(ctx.im2col_step, input.shape[0]) + assert (input.shape[0] % + cur_im2col_step) == 0, 'im2col step must divide batchsize' + + if ctx.needs_input_grad[0] or ctx.needs_input_grad[1]: + grad_input = torch.zeros_like(input) + grad_offset = torch.zeros_like(offset) + _C.deform_conv_backward_input( + input, + offset, + grad_output, + grad_input, + grad_offset, + weight, + ctx.bufs_[0], + weight.size(3), + weight.size(2), + ctx.stride[1], + ctx.stride[0], + ctx.padding[1], + ctx.padding[0], + ctx.dilation[1], + ctx.dilation[0], + ctx.groups, + ctx.deformable_groups, + cur_im2col_step + ) + + if ctx.needs_input_grad[2]: + grad_weight = torch.zeros_like(weight) + _C.deform_conv_backward_parameters( + input, + offset, + grad_output, + grad_weight, + ctx.bufs_[0], + ctx.bufs_[1], + weight.size(3), + weight.size(2), + ctx.stride[1], + ctx.stride[0], + ctx.padding[1], + ctx.padding[0], + ctx.dilation[1], + ctx.dilation[0], + ctx.groups, + ctx.deformable_groups, + 1, + cur_im2col_step + ) + + return (grad_input, grad_offset, grad_weight, None, None, None, None, None) + + @staticmethod + def _output_size(input, weight, padding, dilation, stride): + channels = weight.size(0) + output_size = (input.size(0), channels) + for d in range(input.dim() - 2): + in_size = input.size(d + 2) + pad = padding[d] + kernel = dilation[d] * (weight.size(d + 2) - 1) + 1 + stride_ = stride[d] + output_size += ((in_size + (2 * pad) - kernel) // stride_ + 1, ) + if not all(map(lambda s: s > 0, output_size)): + raise ValueError( + "convolution input is too small (output would be {})".format( + 'x'.join(map(str, output_size)))) + return output_size + + +class ModulatedDeformConvFunction(Function): + + @staticmethod + def forward( + ctx, + input, + offset, + mask, + weight, + bias=None, + stride=1, + padding=0, + dilation=1, + groups=1, + deformable_groups=1 + ): + ctx.stride = stride + ctx.padding = padding + ctx.dilation = dilation + ctx.groups = groups + ctx.deformable_groups = deformable_groups + ctx.with_bias = bias is not None + if not ctx.with_bias: + bias = input.new_empty(1) # fake tensor + if not input.is_cuda: + raise NotImplementedError + if weight.requires_grad or mask.requires_grad or offset.requires_grad \ + or input.requires_grad: + ctx.save_for_backward(input, offset, mask, weight, bias) + output = input.new_empty( + ModulatedDeformConvFunction._infer_shape(ctx, input, weight)) + ctx._bufs = [input.new_empty(0), input.new_empty(0)] + _C.modulated_deform_conv_forward( + input, + weight, + bias, + ctx._bufs[0], + offset, + mask, + output, + ctx._bufs[1], + weight.shape[2], + weight.shape[3], + ctx.stride, + ctx.stride, + ctx.padding, + ctx.padding, + ctx.dilation, + ctx.dilation, + ctx.groups, + ctx.deformable_groups, + ctx.with_bias + ) + return output + + @staticmethod + @once_differentiable + def backward(ctx, grad_output): + if not grad_output.is_cuda: + raise NotImplementedError + input, offset, mask, weight, bias = ctx.saved_tensors + grad_input = torch.zeros_like(input) + grad_offset = torch.zeros_like(offset) + grad_mask = torch.zeros_like(mask) + grad_weight = torch.zeros_like(weight) + grad_bias = torch.zeros_like(bias) + _C.modulated_deform_conv_backward( + input, + weight, + bias, + ctx._bufs[0], + offset, + mask, + ctx._bufs[1], + grad_input, + grad_weight, + 
grad_bias, + grad_offset, + grad_mask, + grad_output, + weight.shape[2], + weight.shape[3], + ctx.stride, + ctx.stride, + ctx.padding, + ctx.padding, + ctx.dilation, + ctx.dilation, + ctx.groups, + ctx.deformable_groups, + ctx.with_bias + ) + if not ctx.with_bias: + grad_bias = None + + return (grad_input, grad_offset, grad_mask, grad_weight, grad_bias, + None, None, None, None, None) + + @staticmethod + def _infer_shape(ctx, input, weight): + n = input.size(0) + channels_out = weight.size(0) + height, width = input.shape[2:4] + kernel_h, kernel_w = weight.shape[2:4] + height_out = (height + 2 * ctx.padding - + (ctx.dilation * (kernel_h - 1) + 1)) // ctx.stride + 1 + width_out = (width + 2 * ctx.padding - + (ctx.dilation * (kernel_w - 1) + 1)) // ctx.stride + 1 + return n, channels_out, height_out, width_out + + +deform_conv = DeformConvFunction.apply +modulated_deform_conv = ModulatedDeformConvFunction.apply diff --git a/maskrcnn_benchmark/layers/dcn/deform_conv_module.py b/maskrcnn_benchmark/layers/dcn/deform_conv_module.py new file mode 100644 index 000000000..e6b58c840 --- /dev/null +++ b/maskrcnn_benchmark/layers/dcn/deform_conv_module.py @@ -0,0 +1,177 @@ +import math + +import torch +import torch.nn as nn +from torch.nn.modules.utils import _pair + +from .deform_conv_func import deform_conv, modulated_deform_conv + + +class DeformConv(nn.Module): + + def __init__( + self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + deformable_groups=1, + bias=False + ): + assert not bias + super(DeformConv, self).__init__() + self.with_bias = bias + + assert in_channels % groups == 0, \ + 'in_channels {} cannot be divisible by groups {}'.format( + in_channels, groups) + assert out_channels % groups == 0, \ + 'out_channels {} cannot be divisible by groups {}'.format( + out_channels, groups) + self.in_channels = in_channels + self.out_channels = out_channels + self.kernel_size = _pair(kernel_size) + self.stride = _pair(stride) + self.padding = _pair(padding) + self.dilation = _pair(dilation) + self.groups = groups + self.deformable_groups = deformable_groups + + self.weight = nn.Parameter( + torch.Tensor(out_channels, in_channels // self.groups, + *self.kernel_size)) + + self.reset_parameters() + + def reset_parameters(self): + n = self.in_channels + for k in self.kernel_size: + n *= k + stdv = 1. 
/ math.sqrt(n) + self.weight.data.uniform_(-stdv, stdv) + + def forward(self, input, offset): + return deform_conv(input, offset, self.weight, self.stride, + self.padding, self.dilation, self.groups, + self.deformable_groups) + + def __repr__(self): + return "".join([ + "{}(".format(self.__class__.__name__), + "in_channels={}, ".format(self.in_channels), + "out_channels={}, ".format(self.out_channels), + "kernel_size={}, ".format(self.kernel_size), + "stride={}, ".format(self.stride), + "dilation={}, ".format(self.dilation), + "padding={}, ".format(self.padding), + "groups={}, ".format(self.groups), + "deformable_groups={}, ".format(self.deformable_groups), + "bias={})".format(self.with_bias), + ]) + + +class ModulatedDeformConv(nn.Module): + + def __init__( + self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + deformable_groups=1, + bias=True + ): + super(ModulatedDeformConv, self).__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.kernel_size = _pair(kernel_size) + self.stride = stride + self.padding = padding + self.dilation = dilation + self.groups = groups + self.deformable_groups = deformable_groups + self.with_bias = bias + + self.weight = nn.Parameter(torch.Tensor( + out_channels, + in_channels // groups, + *self.kernel_size + )) + if bias: + self.bias = nn.Parameter(torch.Tensor(out_channels)) + else: + self.register_parameter('bias', None) + self.reset_parameters() + + def reset_parameters(self): + n = self.in_channels + for k in self.kernel_size: + n *= k + stdv = 1. / math.sqrt(n) + self.weight.data.uniform_(-stdv, stdv) + if self.bias is not None: + self.bias.data.zero_() + + def forward(self, input, offset, mask): + return modulated_deform_conv( + input, offset, mask, self.weight, self.bias, self.stride, + self.padding, self.dilation, self.groups, self.deformable_groups) + + def __repr__(self): + return "".join([ + "{}(".format(self.__class__.__name__), + "in_channels={}, ".format(self.in_channels), + "out_channels={}, ".format(self.out_channels), + "kernel_size={}, ".format(self.kernel_size), + "stride={}, ".format(self.stride), + "dilation={}, ".format(self.dilation), + "padding={}, ".format(self.padding), + "groups={}, ".format(self.groups), + "deformable_groups={}, ".format(self.deformable_groups), + "bias={})".format(self.with_bias), + ]) + +class ModulatedDeformConvPack(ModulatedDeformConv): + + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + deformable_groups=1, + bias=True): + super(ModulatedDeformConvPack, self).__init__( + in_channels, out_channels, kernel_size, stride, padding, dilation, + groups, deformable_groups, bias) + + self.conv_offset_mask = nn.Conv2d( + self.in_channels // self.groups, + self.deformable_groups * 3 * self.kernel_size[0] * + self.kernel_size[1], + kernel_size=self.kernel_size, + stride=_pair(self.stride), + padding=_pair(self.padding), + bias=True) + self.init_offset() + + def init_offset(self): + self.conv_offset_mask.weight.data.zero_() + self.conv_offset_mask.bias.data.zero_() + + def forward(self, input): + out = self.conv_offset_mask(input) + o1, o2, mask = torch.chunk(out, 3, dim=1) + offset = torch.cat((o1, o2), dim=1) + mask = torch.sigmoid(mask) + return modulated_deform_conv( + input, offset, mask, self.weight, self.bias, self.stride, + self.padding, self.dilation, self.groups, self.deformable_groups) diff --git a/maskrcnn_benchmark/layers/dcn/deform_pool_func.py 
b/maskrcnn_benchmark/layers/dcn/deform_pool_func.py new file mode 100644 index 000000000..2f7810b23 --- /dev/null +++ b/maskrcnn_benchmark/layers/dcn/deform_pool_func.py @@ -0,0 +1,95 @@ +import torch +from torch.autograd import Function +from torch.autograd.function import once_differentiable + +from maskrcnn_benchmark import _C + + +class DeformRoIPoolingFunction(Function): + + @staticmethod + def forward( + ctx, + data, + rois, + offset, + spatial_scale, + out_size, + out_channels, + no_trans, + group_size=1, + part_size=None, + sample_per_part=4, + trans_std=.0 + ): + ctx.spatial_scale = spatial_scale + ctx.out_size = out_size + ctx.out_channels = out_channels + ctx.no_trans = no_trans + ctx.group_size = group_size + ctx.part_size = out_size if part_size is None else part_size + ctx.sample_per_part = sample_per_part + ctx.trans_std = trans_std + + assert 0.0 <= ctx.trans_std <= 1.0 + if not data.is_cuda: + raise NotImplementedError + + n = rois.shape[0] + output = data.new_empty(n, out_channels, out_size, out_size) + output_count = data.new_empty(n, out_channels, out_size, out_size) + _C.deform_psroi_pooling_forward( + data, + rois, + offset, + output, + output_count, + ctx.no_trans, + ctx.spatial_scale, + ctx.out_channels, + ctx.group_size, + ctx.out_size, + ctx.part_size, + ctx.sample_per_part, + ctx.trans_std + ) + + if data.requires_grad or rois.requires_grad or offset.requires_grad: + ctx.save_for_backward(data, rois, offset) + ctx.output_count = output_count + + return output + + @staticmethod + @once_differentiable + def backward(ctx, grad_output): + if not grad_output.is_cuda: + raise NotImplementedError + + data, rois, offset = ctx.saved_tensors + output_count = ctx.output_count + grad_input = torch.zeros_like(data) + grad_rois = None + grad_offset = torch.zeros_like(offset) + + _C.deform_psroi_pooling_backward( + grad_output, + data, + rois, + offset, + output_count, + grad_input, + grad_offset, + ctx.no_trans, + ctx.spatial_scale, + ctx.out_channels, + ctx.group_size, + ctx.out_size, + ctx.part_size, + ctx.sample_per_part, + ctx.trans_std + ) + return (grad_input, grad_rois, grad_offset, None, None, None, None, None, None, None, None) + + +deform_roi_pooling = DeformRoIPoolingFunction.apply diff --git a/maskrcnn_benchmark/layers/dcn/deform_pool_module.py b/maskrcnn_benchmark/layers/dcn/deform_pool_module.py new file mode 100644 index 000000000..bab6c2604 --- /dev/null +++ b/maskrcnn_benchmark/layers/dcn/deform_pool_module.py @@ -0,0 +1,150 @@ +from torch import nn + +from .deform_pool_func import deform_roi_pooling + + +class DeformRoIPooling(nn.Module): + + def __init__(self, + spatial_scale, + out_size, + out_channels, + no_trans, + group_size=1, + part_size=None, + sample_per_part=4, + trans_std=.0): + super(DeformRoIPooling, self).__init__() + self.spatial_scale = spatial_scale + self.out_size = out_size + self.out_channels = out_channels + self.no_trans = no_trans + self.group_size = group_size + self.part_size = out_size if part_size is None else part_size + self.sample_per_part = sample_per_part + self.trans_std = trans_std + + def forward(self, data, rois, offset): + if self.no_trans: + offset = data.new_empty(0) + return deform_roi_pooling( + data, rois, offset, self.spatial_scale, self.out_size, + self.out_channels, self.no_trans, self.group_size, self.part_size, + self.sample_per_part, self.trans_std) + + +class DeformRoIPoolingPack(DeformRoIPooling): + + def __init__(self, + spatial_scale, + out_size, + out_channels, + no_trans, + group_size=1, + part_size=None, 
+ sample_per_part=4, + trans_std=.0, + deform_fc_channels=1024): + super(DeformRoIPoolingPack, + self).__init__(spatial_scale, out_size, out_channels, no_trans, + group_size, part_size, sample_per_part, trans_std) + + self.deform_fc_channels = deform_fc_channels + + if not no_trans: + self.offset_fc = nn.Sequential( + nn.Linear(self.out_size * self.out_size * self.out_channels, + self.deform_fc_channels), + nn.ReLU(inplace=True), + nn.Linear(self.deform_fc_channels, self.deform_fc_channels), + nn.ReLU(inplace=True), + nn.Linear(self.deform_fc_channels, + self.out_size * self.out_size * 2)) + self.offset_fc[-1].weight.data.zero_() + self.offset_fc[-1].bias.data.zero_() + + def forward(self, data, rois): + assert data.size(1) == self.out_channels + if self.no_trans: + offset = data.new_empty(0) + return deform_roi_pooling( + data, rois, offset, self.spatial_scale, self.out_size, + self.out_channels, self.no_trans, self.group_size, + self.part_size, self.sample_per_part, self.trans_std) + else: + n = rois.shape[0] + offset = data.new_empty(0) + x = deform_roi_pooling(data, rois, offset, self.spatial_scale, + self.out_size, self.out_channels, True, + self.group_size, self.part_size, + self.sample_per_part, self.trans_std) + offset = self.offset_fc(x.view(n, -1)) + offset = offset.view(n, 2, self.out_size, self.out_size) + return deform_roi_pooling( + data, rois, offset, self.spatial_scale, self.out_size, + self.out_channels, self.no_trans, self.group_size, + self.part_size, self.sample_per_part, self.trans_std) + + +class ModulatedDeformRoIPoolingPack(DeformRoIPooling): + + def __init__(self, + spatial_scale, + out_size, + out_channels, + no_trans, + group_size=1, + part_size=None, + sample_per_part=4, + trans_std=.0, + deform_fc_channels=1024): + super(ModulatedDeformRoIPoolingPack, self).__init__( + spatial_scale, out_size, out_channels, no_trans, group_size, + part_size, sample_per_part, trans_std) + + self.deform_fc_channels = deform_fc_channels + + if not no_trans: + self.offset_fc = nn.Sequential( + nn.Linear(self.out_size * self.out_size * self.out_channels, + self.deform_fc_channels), + nn.ReLU(inplace=True), + nn.Linear(self.deform_fc_channels, self.deform_fc_channels), + nn.ReLU(inplace=True), + nn.Linear(self.deform_fc_channels, + self.out_size * self.out_size * 2)) + self.offset_fc[-1].weight.data.zero_() + self.offset_fc[-1].bias.data.zero_() + self.mask_fc = nn.Sequential( + nn.Linear(self.out_size * self.out_size * self.out_channels, + self.deform_fc_channels), + nn.ReLU(inplace=True), + nn.Linear(self.deform_fc_channels, + self.out_size * self.out_size * 1), + nn.Sigmoid()) + self.mask_fc[2].weight.data.zero_() + self.mask_fc[2].bias.data.zero_() + + def forward(self, data, rois): + assert data.size(1) == self.out_channels + if self.no_trans: + offset = data.new_empty(0) + return deform_roi_pooling( + data, rois, offset, self.spatial_scale, self.out_size, + self.out_channels, self.no_trans, self.group_size, + self.part_size, self.sample_per_part, self.trans_std) + else: + n = rois.shape[0] + offset = data.new_empty(0) + x = deform_roi_pooling(data, rois, offset, self.spatial_scale, + self.out_size, self.out_channels, True, + self.group_size, self.part_size, + self.sample_per_part, self.trans_std) + offset = self.offset_fc(x.view(n, -1)) + offset = offset.view(n, 2, self.out_size, self.out_size) + mask = self.mask_fc(x.view(n, -1)) + mask = mask.view(n, 1, self.out_size, self.out_size) + return deform_roi_pooling( + data, rois, offset, self.spatial_scale, self.out_size, + 
self.out_channels, self.no_trans, self.group_size, + self.part_size, self.sample_per_part, self.trans_std) * mask diff --git a/maskrcnn_benchmark/layers/misc.py b/maskrcnn_benchmark/layers/misc.py index a8cf1c680..b64f23840 100644 --- a/maskrcnn_benchmark/layers/misc.py +++ b/maskrcnn_benchmark/layers/misc.py @@ -11,6 +11,7 @@ import math import torch +from torch import nn from torch.nn.modules.utils import _ntuple @@ -108,3 +109,95 @@ def _output_size(dim): output_shape = tuple(_output_size(2)) output_shape = input.shape[:-2] + output_shape return _NewEmptyTensorOp.apply(input, output_shape) + + +class DFConv2d(nn.Module): + """Deformable convolutional layer""" + def __init__( + self, + in_channels, + out_channels, + with_modulated_dcn=True, + kernel_size=3, + stride=1, + groups=1, + dilation=1, + deformable_groups=1, + bias=False + ): + super(DFConv2d, self).__init__() + if isinstance(kernel_size, (list, tuple)): + assert isinstance(stride, (list, tuple)) + assert isinstance(dilation, (list, tuple)) + assert len(kernel_size) == 2 + assert len(stride) == 2 + assert len(dilation) == 2 + padding = ( + dilation[0] * (kernel_size[0] - 1) // 2, + dilation[1] * (kernel_size[1] - 1) // 2 + ) + offset_base_channels = kernel_size[0] * kernel_size[1] + else: + padding = dilation * (kernel_size - 1) // 2 + offset_base_channels = kernel_size * kernel_size + if with_modulated_dcn: + from maskrcnn_benchmark.layers import ModulatedDeformConv + offset_channels = offset_base_channels * 3 #default: 27 + conv_block = ModulatedDeformConv + else: + from maskrcnn_benchmark.layers import DeformConv + offset_channels = offset_base_channels * 2 #default: 18 + conv_block = DeformConv + self.offset = Conv2d( + in_channels, + deformable_groups * offset_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + groups=1, + dilation=dilation + ) + for l in [self.offset,]: + nn.init.kaiming_uniform_(l.weight, a=1) + torch.nn.init.constant_(l.bias, 0.) 
+ self.conv = conv_block( + in_channels, + out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + groups=groups, + deformable_groups=deformable_groups, + bias=bias + ) + self.with_modulated_dcn = with_modulated_dcn + self.kernel_size = kernel_size + self.stride = stride + self.padding = padding + self.dilation = dilation + + def forward(self, x): + if x.numel() > 0: + if not self.with_modulated_dcn: + offset = self.offset(x) + x = self.conv(x, offset) + else: + offset_mask = self.offset(x) + offset = offset_mask[:, :18, :, :] + mask = offset_mask[:, -9:, :, :].sigmoid() + x = self.conv(x, offset, mask) + return x + # get output shape + output_shape = [ + (i + 2 * p - (di * (k - 1) + 1)) // d + 1 + for i, p, di, k, d in zip( + x.shape[-2:], + self.padding, + self.dilation, + self.kernel_size, + self.stride + ) + ] + output_shape = [x.shape[0], self.conv.weight.shape[0]] + output_shape + return _NewEmptyTensorOp.apply(x, output_shape) diff --git a/maskrcnn_benchmark/layers/nms.py b/maskrcnn_benchmark/layers/nms.py index 1e80b5550..39bff82b3 100644 --- a/maskrcnn_benchmark/layers/nms.py +++ b/maskrcnn_benchmark/layers/nms.py @@ -2,6 +2,10 @@ # from ._utils import _C from maskrcnn_benchmark import _C -nms = _C.nms +from apex import amp + +# Only valid with fp32 inputs - give AMP the hint +nms = amp.float_function(_C.nms) + # nms.__doc__ = """ # This function performs Non-maximum suppresion""" diff --git a/maskrcnn_benchmark/layers/roi_align.py b/maskrcnn_benchmark/layers/roi_align.py index 170c8f186..ec797ed25 100644 --- a/maskrcnn_benchmark/layers/roi_align.py +++ b/maskrcnn_benchmark/layers/roi_align.py @@ -7,6 +7,7 @@ from maskrcnn_benchmark import _C +from apex import amp class _ROIAlign(Function): @staticmethod @@ -46,7 +47,6 @@ def backward(ctx, grad_output): roi_align = _ROIAlign.apply - class ROIAlign(nn.Module): def __init__(self, output_size, spatial_scale, sampling_ratio): super(ROIAlign, self).__init__() @@ -54,6 +54,7 @@ def __init__(self, output_size, spatial_scale, sampling_ratio): self.spatial_scale = spatial_scale self.sampling_ratio = sampling_ratio + @amp.float_function def forward(self, input, rois): return roi_align( input, rois, self.output_size, self.spatial_scale, self.sampling_ratio diff --git a/maskrcnn_benchmark/layers/roi_pool.py b/maskrcnn_benchmark/layers/roi_pool.py index c0e42756e..586339076 100644 --- a/maskrcnn_benchmark/layers/roi_pool.py +++ b/maskrcnn_benchmark/layers/roi_pool.py @@ -7,6 +7,7 @@ from maskrcnn_benchmark import _C +from apex import amp class _ROIPool(Function): @staticmethod @@ -52,6 +53,7 @@ def __init__(self, output_size, spatial_scale): self.output_size = output_size self.spatial_scale = spatial_scale + @amp.float_function def forward(self, input, rois): return roi_pool(input, rois, self.output_size, self.spatial_scale) diff --git a/maskrcnn_benchmark/modeling/backbone/resnet.py b/maskrcnn_benchmark/modeling/backbone/resnet.py index 15d96720c..fc02dc1e8 100644 --- a/maskrcnn_benchmark/modeling/backbone/resnet.py +++ b/maskrcnn_benchmark/modeling/backbone/resnet.py @@ -24,6 +24,7 @@ from maskrcnn_benchmark.layers import FrozenBatchNorm2d from maskrcnn_benchmark.layers import Conv2d +from maskrcnn_benchmark.layers import DFConv2d from maskrcnn_benchmark.modeling.make_layers import group_norm from maskrcnn_benchmark.utils.registry import Registry @@ -33,7 +34,7 @@ "StageSpec", [ "index", # Index of the stage, eg 1, 2, ..,. 
5 - "block_count", # Numer of residual blocks in the stage + "block_count", # Number of residual blocks in the stage "return_features", # True => return the last feature map from this stage ], ) @@ -106,6 +107,7 @@ def __init__(self, cfg): stage2_relative_factor = 2 ** (stage_spec.index - 1) bottleneck_channels = stage2_bottleneck_channels * stage2_relative_factor out_channels = stage2_out_channels * stage2_relative_factor + stage_with_dcn = cfg.MODEL.RESNETS.STAGE_WITH_DCN[stage_spec.index -1] module = _make_stage( transformation_module, in_channels, @@ -115,6 +117,11 @@ def __init__(self, cfg): num_groups, cfg.MODEL.RESNETS.STRIDE_IN_1X1, first_stride=int(stage_spec.index > 1) + 1, + dcn_config={ + "stage_with_dcn": stage_with_dcn, + "with_modulated_dcn": cfg.MODEL.RESNETS.WITH_MODULATED_DCN, + "deformable_groups": cfg.MODEL.RESNETS.DEFORMABLE_GROUPS, + } ) in_channels = out_channels self.add_module(name, module) @@ -155,7 +162,8 @@ def __init__( stride_in_1x1=True, stride_init=None, res2_out_channels=256, - dilation=1 + dilation=1, + dcn_config={} ): super(ResNetHead, self).__init__() @@ -182,7 +190,8 @@ def __init__( num_groups, stride_in_1x1, first_stride=stride, - dilation=dilation + dilation=dilation, + dcn_config=dcn_config ) stride = None self.add_module(name, module) @@ -204,7 +213,8 @@ def _make_stage( num_groups, stride_in_1x1, first_stride, - dilation=1 + dilation=1, + dcn_config={} ): blocks = [] stride = first_stride @@ -217,7 +227,8 @@ def _make_stage( num_groups, stride_in_1x1, stride, - dilation=dilation + dilation=dilation, + dcn_config=dcn_config ) ) stride = 1 @@ -235,7 +246,8 @@ def __init__( stride_in_1x1, stride, dilation, - norm_func + norm_func, + dcn_config ): super(Bottleneck, self).__init__() @@ -271,17 +283,34 @@ def __init__( ) self.bn1 = norm_func(bottleneck_channels) # TODO: specify init for the above + with_dcn = dcn_config.get("stage_with_dcn", False) + if with_dcn: + deformable_groups = dcn_config.get("deformable_groups", 1) + with_modulated_dcn = dcn_config.get("with_modulated_dcn", False) + self.conv2 = DFConv2d( + bottleneck_channels, + bottleneck_channels, + with_modulated_dcn=with_modulated_dcn, + kernel_size=3, + stride=stride_3x3, + groups=num_groups, + dilation=dilation, + deformable_groups=deformable_groups, + bias=False + ) + else: + self.conv2 = Conv2d( + bottleneck_channels, + bottleneck_channels, + kernel_size=3, + stride=stride_3x3, + padding=dilation, + bias=False, + groups=num_groups, + dilation=dilation + ) + nn.init.kaiming_uniform_(self.conv2.weight, a=1) - self.conv2 = Conv2d( - bottleneck_channels, - bottleneck_channels, - kernel_size=3, - stride=stride_3x3, - padding=dilation, - bias=False, - groups=num_groups, - dilation=dilation - ) self.bn2 = norm_func(bottleneck_channels) self.conv3 = Conv2d( @@ -289,7 +318,7 @@ def __init__( ) self.bn3 = norm_func(out_channels) - for l in [self.conv1, self.conv2, self.conv3,]: + for l in [self.conv1, self.conv3,]: nn.init.kaiming_uniform_(l.weight, a=1) def forward(self, x): @@ -346,7 +375,8 @@ def __init__( num_groups=1, stride_in_1x1=True, stride=1, - dilation=1 + dilation=1, + dcn_config={} ): super(BottleneckWithFixedBatchNorm, self).__init__( in_channels=in_channels, @@ -356,7 +386,8 @@ def __init__( stride_in_1x1=stride_in_1x1, stride=stride, dilation=dilation, - norm_func=FrozenBatchNorm2d + norm_func=FrozenBatchNorm2d, + dcn_config=dcn_config ) @@ -376,7 +407,8 @@ def __init__( num_groups=1, stride_in_1x1=True, stride=1, - dilation=1 + dilation=1, + dcn_config={} ): 
super(BottleneckWithGN, self).__init__( in_channels=in_channels, @@ -386,7 +418,8 @@ def __init__( stride_in_1x1=stride_in_1x1, stride=stride, dilation=dilation, - norm_func=group_norm + norm_func=group_norm, + dcn_config=dcn_config ) diff --git a/maskrcnn_benchmark/modeling/poolers.py b/maskrcnn_benchmark/modeling/poolers.py index 9b3524d20..519440e79 100644 --- a/maskrcnn_benchmark/modeling/poolers.py +++ b/maskrcnn_benchmark/modeling/poolers.py @@ -116,7 +116,7 @@ def forward(self, x, boxes): for level, (per_level_feature, pooler) in enumerate(zip(x, self.poolers)): idx_in_level = torch.nonzero(levels == level).squeeze(1) rois_per_level = rois[idx_in_level] - result[idx_in_level] = pooler(per_level_feature, rois_per_level) + result[idx_in_level] = pooler(per_level_feature, rois_per_level).to(dtype) return result diff --git a/maskrcnn_benchmark/modeling/roi_heads/box_head/inference.py b/maskrcnn_benchmark/modeling/roi_heads/box_head/inference.py index 595a2e616..cc2f4fa85 100644 --- a/maskrcnn_benchmark/modeling/roi_heads/box_head/inference.py +++ b/maskrcnn_benchmark/modeling/roi_heads/box_head/inference.py @@ -22,7 +22,8 @@ def __init__( nms=0.5, detections_per_img=100, box_coder=None, - cls_agnostic_bbox_reg=False + cls_agnostic_bbox_reg=False, + bbox_aug_enabled=False ): """ Arguments: @@ -39,6 +40,7 @@ def __init__( box_coder = BoxCoder(weights=(10., 10., 5., 5.)) self.box_coder = box_coder self.cls_agnostic_bbox_reg = cls_agnostic_bbox_reg + self.bbox_aug_enabled = bbox_aug_enabled def forward(self, x, boxes): """ @@ -79,7 +81,8 @@ def forward(self, x, boxes): ): boxlist = self.prepare_boxlist(boxes_per_img, prob, image_shape) boxlist = boxlist.clip_to_image(remove_empty=False) - boxlist = self.filter_results(boxlist, num_classes) + if not self.bbox_aug_enabled: # If bbox aug is enabled, we will do it later + boxlist = self.filter_results(boxlist, num_classes) results.append(boxlist) return results @@ -156,12 +159,14 @@ def make_roi_box_post_processor(cfg): nms_thresh = cfg.MODEL.ROI_HEADS.NMS detections_per_img = cfg.MODEL.ROI_HEADS.DETECTIONS_PER_IMG cls_agnostic_bbox_reg = cfg.MODEL.CLS_AGNOSTIC_BBOX_REG + bbox_aug_enabled = cfg.TEST.BBOX_AUG.ENABLED postprocessor = PostProcessor( score_thresh, nms_thresh, detections_per_img, box_coder, - cls_agnostic_bbox_reg + cls_agnostic_bbox_reg, + bbox_aug_enabled ) return postprocessor diff --git a/maskrcnn_benchmark/modeling/roi_heads/mask_head/inference.py b/maskrcnn_benchmark/modeling/roi_heads/mask_head/inference.py index cd033e06c..bd831c085 100644 --- a/maskrcnn_benchmark/modeling/roi_heads/mask_head/inference.py +++ b/maskrcnn_benchmark/modeling/roi_heads/mask_head/inference.py @@ -111,11 +111,16 @@ def expand_masks(mask, padding): pad2 = 2 * padding scale = float(M + pad2) / M padded_mask = mask.new_zeros((N, 1, M + pad2, M + pad2)) + padded_mask[:, :, padding:-padding, padding:-padding] = mask return padded_mask, scale def paste_mask_in_image(mask, box, im_h, im_w, thresh=0.5, padding=1): + # Need to work on the CPU, where fp16 isn't supported - cast to float to avoid this + mask = mask.float() + box = box.float() + padded_mask, scale = expand_masks(mask[None], padding=padding) mask = padded_mask[0, 0] box = expand_boxes(box[None], scale)[0] diff --git a/maskrcnn_benchmark/modeling/roi_heads/mask_head/loss.py b/maskrcnn_benchmark/modeling/roi_heads/mask_head/loss.py index 36dcaa325..d4c5e3621 100644 --- a/maskrcnn_benchmark/modeling/roi_heads/mask_head/loss.py +++ b/maskrcnn_benchmark/modeling/roi_heads/mask_head/loss.py @@ 
-27,17 +27,15 @@ def project_masks_on_boxes(segmentation_masks, proposals, discretization_size): assert segmentation_masks.size == proposals.size, "{}, {}".format( segmentation_masks, proposals ) - # TODO put the proposals on the CPU, as the representation for the - # masks is not efficient GPU-wise (possibly several small tensors for - # representing a single instance mask) + + # FIXME: CPU computation bottleneck, this should be parallelized proposals = proposals.bbox.to(torch.device("cpu")) for segmentation_mask, proposal in zip(segmentation_masks, proposals): # crop the masks, resize them to the desired resolution and - # then convert them to the tensor representation, - # instead of the list representation that was used + # then convert them to the tensor representation. cropped_mask = segmentation_mask.crop(proposal) scaled_mask = cropped_mask.resize((M, M)) - mask = scaled_mask.convert(mode="mask") + mask = scaled_mask.get_mask_tensor() masks.append(mask) if len(masks) == 0: return torch.empty(0, dtype=torch.float32, device=device) diff --git a/maskrcnn_benchmark/modeling/rpn/inference.py b/maskrcnn_benchmark/modeling/rpn/inference.py index 556082645..9fd23c5bc 100644 --- a/maskrcnn_benchmark/modeling/rpn/inference.py +++ b/maskrcnn_benchmark/modeling/rpn/inference.py @@ -24,6 +24,7 @@ def __init__( min_size, box_coder=None, fpn_post_nms_top_n=None, + fpn_post_nms_per_batch=True, ): """ Arguments: @@ -47,6 +48,7 @@ def __init__( if fpn_post_nms_top_n is None: fpn_post_nms_top_n = post_nms_top_n self.fpn_post_nms_top_n = fpn_post_nms_top_n + self.fpn_post_nms_per_batch = fpn_post_nms_per_batch def add_gt_proposals(self, proposals, targets): """ @@ -154,9 +156,9 @@ def select_over_all_levels(self, boxlists): # different behavior during training and during testing: # during training, post_nms_top_n is over *all* the proposals combined, while # during testing, it is over the proposals for each image - # TODO resolve this difference and make it consistent. It should be per image, - # and not per batch - if self.training: + # NOTE: it should be per image, and not per batch. 
However, to be consistent + # with Detectron, the default is per batch (see Issue #672) + if self.training and self.fpn_post_nms_per_batch: objectness = torch.cat( [boxlist.get_field("objectness") for boxlist in boxlists], dim=0 ) @@ -189,6 +191,7 @@ def make_rpn_postprocessor(config, rpn_box_coder, is_train): if not is_train: pre_nms_top_n = config.MODEL.RPN.PRE_NMS_TOP_N_TEST post_nms_top_n = config.MODEL.RPN.POST_NMS_TOP_N_TEST + fpn_post_nms_per_batch = config.MODEL.RPN.FPN_POST_NMS_PER_BATCH nms_thresh = config.MODEL.RPN.NMS_THRESH min_size = config.MODEL.RPN.MIN_SIZE box_selector = RPNPostProcessor( @@ -198,5 +201,6 @@ def make_rpn_postprocessor(config, rpn_box_coder, is_train): min_size=min_size, box_coder=rpn_box_coder, fpn_post_nms_top_n=fpn_post_nms_top_n, + fpn_post_nms_per_batch=fpn_post_nms_per_batch, ) return box_selector diff --git a/maskrcnn_benchmark/structures/boxlist_ops.py b/maskrcnn_benchmark/structures/boxlist_ops.py index dc51212f4..02dcaf121 100644 --- a/maskrcnn_benchmark/structures/boxlist_ops.py +++ b/maskrcnn_benchmark/structures/boxlist_ops.py @@ -67,7 +67,8 @@ def boxlist_iou(boxlist1, boxlist2): if boxlist1.size != boxlist2.size: raise RuntimeError( "boxlists should have same image size, got {}, {}".format(boxlist1, boxlist2)) - + boxlist1 = boxlist1.convert("xyxy") + boxlist2 = boxlist2.convert("xyxy") N = len(boxlist1) M = len(boxlist2) diff --git a/maskrcnn_benchmark/structures/segmentation_mask.py b/maskrcnn_benchmark/structures/segmentation_mask.py index ba1290b91..364d01eb5 100644 --- a/maskrcnn_benchmark/structures/segmentation_mask.py +++ b/maskrcnn_benchmark/structures/segmentation_mask.py @@ -1,5 +1,8 @@ -# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +import cv2 +import copy import torch +import numpy as np +from maskrcnn_benchmark.layers.misc import interpolate import pycocotools.mask as mask_utils @@ -8,63 +11,214 @@ FLIP_TOP_BOTTOM = 1 -class Mask(object): +""" ABSTRACT +Segmentations come in either: +1) Binary masks +2) Polygons + +Binary masks can be represented in a contiguous array +and operations can be carried out more efficiently, +therefore BinaryMaskList handles them together. + +Polygons are handled separately for each instance, +by PolygonInstance and instances are handled by +PolygonList. + +SegmentationList is supposed to represent both, +therefore it wraps the functions of BinaryMaskList +and PolygonList to make it transparent. +""" + + +class BinaryMaskList(object): """ - This class is unfinished and not meant for use yet - It is supposed to contain the mask for an object as - a 2d tensor + This class handles binary masks for all objects in the image """ - def __init__(self, masks, size, mode): + def __init__(self, masks, size): + """ + Arguments: + masks: Either torch.tensor of [num_instances, H, W] + or list of torch.tensors of [H, W] with num_instances elems, + or RLE (Run Length Encoding) - interpreted as list of dicts, + or BinaryMaskList. + size: absolute image size, width first + + After initialization, a hard copy will be made, to leave the + initializing source data intact. + """ + + if isinstance(masks, torch.Tensor): + # The raw data representation is passed as argument + masks = masks.clone() + elif isinstance(masks, (list, tuple)): + if len(masks) == 0: + masks = torch.empty([0, size[1], size[0]]) # num_instances = 0! 
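As a quick illustration of the constructor contract documented above (a `[num_instances, H, W]` tensor, a list of `[H, W]` tensors, RLE dicts, or another `BinaryMaskList`, with `size` given width-first), here is a small usage sketch; the shapes and box values are invented, and the methods it calls are defined further down in this file.

```python
import torch
from maskrcnn_benchmark.structures.segmentation_mask import BinaryMaskList

width, height = 640, 480
masks = torch.zeros((2, height, width), dtype=torch.uint8)  # [num_instances, H, W]
masks[0, 100:200, 150:300] = 1                              # fake instance

mask_list = BinaryMaskList(masks, size=(width, height))     # size is (width, height)
print(len(mask_list))                            # 2
flipped = mask_list.transpose(1)                 # FLIP_TOP_BOTTOM, defined at the top of the module
cropped = mask_list.crop([150, 100, 300, 200])   # xyxy box, clamped to the image bounds
resized = mask_list.resize((320, 240))           # width-first again; interpolate() wants height-first
```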
+ elif isinstance(masks[0], torch.Tensor): + masks = torch.stack(masks, dim=2).clone() + elif isinstance(masks[0], dict) and "counts" in masks[0]: + # RLE interpretation + assert all( + [(size[1], size[0]) == tuple(inst["size"]) for inst in masks] + ) # in RLE, height come first in "size" + masks = mask_utils.decode(masks) # [h, w, n] + masks = torch.tensor(masks).permute(2, 0, 1) # [n, h, w] + else: + RuntimeError( + "Type of `masks[0]` could not be interpreted: %s" % type(masks) + ) + elif isinstance(masks, BinaryMaskList): + # just hard copy the BinaryMaskList instance's underlying data + masks = masks.masks.clone() + else: + RuntimeError( + "Type of `masks` argument could not be interpreted:%s" % type(masks) + ) + + if len(masks.shape) == 2: + # if only a single instance mask is passed + masks = masks[None] + + assert len(masks.shape) == 3 + assert masks.shape[1] == size[1], "%s != %s" % (masks.shape[1], size[1]) + assert masks.shape[2] == size[0], "%s != %s" % (masks.shape[2], size[0]) + self.masks = masks - self.size = size - self.mode = mode + self.size = tuple(size) def transpose(self, method): - if method not in (FLIP_LEFT_RIGHT, FLIP_TOP_BOTTOM): - raise NotImplementedError( - "Only FLIP_LEFT_RIGHT and FLIP_TOP_BOTTOM implemented" + dim = 1 if method == FLIP_TOP_BOTTOM else 2 + flipped_masks = self.masks.flip(dim) + return BinaryMaskList(flipped_masks, self.size) + + def crop(self, box): + assert isinstance(box, (list, tuple, torch.Tensor)), str(type(box)) + # box is assumed to be xyxy + current_width, current_height = self.size + xmin, ymin, xmax, ymax = [round(float(b)) for b in box] + + assert xmin <= xmax and ymin <= ymax, str(box) + xmin = min(max(xmin, 0), current_width - 1) + ymin = min(max(ymin, 0), current_height - 1) + + xmax = min(max(xmax, 0), current_width) + ymax = min(max(ymax, 0), current_height) + + xmax = max(xmax, xmin + 1) + ymax = max(ymax, ymin + 1) + + width, height = xmax - xmin, ymax - ymin + cropped_masks = self.masks[:, ymin:ymax, xmin:xmax] + cropped_size = width, height + return BinaryMaskList(cropped_masks, cropped_size) + + def resize(self, size): + try: + iter(size) + except TypeError: + assert isinstance(size, (int, float)) + size = size, size + width, height = map(int, size) + + assert width > 0 + assert height > 0 + + # Height comes first here! 
+ resized_masks = interpolate( + input=self.masks[None].float(), + size=(height, width), + mode="bilinear", + align_corners=False, + )[0].type_as(self.masks) + resized_size = width, height + return BinaryMaskList(resized_masks, resized_size) + + def convert_to_polygon(self): + if self.masks.numel() == 0: + return PolygonList([], self.size) + + contours = self._findContours() + return PolygonList(contours, self.size) + + def to(self, *args, **kwargs): + return self + + def _findContours(self): + contours = [] + masks = self.masks.detach().numpy() + for mask in masks: + mask = cv2.UMat(mask) + contour, hierarchy = cv2.findContours( + mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_TC89_L1 ) - width, height = self.size - if method == FLIP_LEFT_RIGHT: - dim = width - idx = 2 - elif method == FLIP_TOP_BOTTOM: - dim = height - idx = 1 + reshaped_contour = [] + for entity in contour: + assert len(entity.shape) == 3 + assert entity.shape[1] == 1, "Hierarchical contours are not allowed" + reshaped_contour.append(entity.reshape(-1).tolist()) + contours.append(reshaped_contour) + return contours - flip_idx = list(range(dim)[::-1]) - flipped_masks = self.masks.index_select(dim, flip_idx) - return Mask(flipped_masks, self.size, self.mode) + def __len__(self): + return len(self.masks) - def crop(self, box): - w, h = box[2] - box[0], box[3] - box[1] + def __getitem__(self, index): + if self.masks.numel() == 0: + raise RuntimeError("Indexing empty BinaryMaskList") + return BinaryMaskList(self.masks[index], self.size) - cropped_masks = self.masks[:, box[1] : box[3], box[0] : box[2]] - return Mask(cropped_masks, size=(w, h), mode=self.mode) + def __iter__(self): + return iter(self.masks) - def resize(self, size, *args, **kwargs): - pass + def __repr__(self): + s = self.__class__.__name__ + "(" + s += "num_instances={}, ".format(len(self.masks)) + s += "image_width={}, ".format(self.size[0]) + s += "image_height={})".format(self.size[1]) + return s -class Polygons(object): +class PolygonInstance(object): """ This class holds a set of polygons that represents a single instance of an object mask. The object can be represented as a set of polygons """ - def __init__(self, polygons, size, mode): - # assert isinstance(polygons, list), '{}'.format(polygons) - if isinstance(polygons, list): - polygons = [torch.as_tensor(p, dtype=torch.float32) for p in polygons] - elif isinstance(polygons, Polygons): - polygons = polygons.polygons + def __init__(self, polygons, size): + """ + Arguments: + a list of lists of numbers. + The first level refers to all the polygons that compose the + object, and the second level to the polygon coordinates. + """ + if isinstance(polygons, (list, tuple)): + valid_polygons = [] + for p in polygons: + p = torch.as_tensor(p, dtype=torch.float32) + if len(p) >= 6: # 3 * 2 coordinates + valid_polygons.append(p) + polygons = valid_polygons + + elif isinstance(polygons, PolygonInstance): + polygons = copy.copy(polygons.polygons) + + else: + RuntimeError( + "Type of argument `polygons` is not allowed:%s" % (type(polygons)) + ) + + """ This crashes the training way too many times... 
+ for p in polygons: + assert p[::2].min() >= 0 + assert p[::2].max() < size[0] + assert p[1::2].min() >= 0 + assert p[1::2].max() , size[1] + """ self.polygons = polygons - self.size = size - self.mode = mode + self.size = tuple(size) def transpose(self, method): if method not in (FLIP_LEFT_RIGHT, FLIP_TOP_BOTTOM): @@ -87,30 +241,49 @@ def transpose(self, method): p[idx::2] = dim - poly[idx::2] - TO_REMOVE flipped_polygons.append(p) - return Polygons(flipped_polygons, size=self.size, mode=self.mode) + return PolygonInstance(flipped_polygons, size=self.size) def crop(self, box): - w, h = box[2] - box[0], box[3] - box[1] + assert isinstance(box, (list, tuple, torch.Tensor)), str(type(box)) + + # box is assumed to be xyxy + current_width, current_height = self.size + xmin, ymin, xmax, ymax = map(float, box) - # TODO chck if necessary - w = max(w, 1) - h = max(h, 1) + assert xmin <= xmax and ymin <= ymax, str(box) + xmin = min(max(xmin, 0), current_width - 1) + ymin = min(max(ymin, 0), current_height - 1) + + xmax = min(max(xmax, 0), current_width) + ymax = min(max(ymax, 0), current_height) + + xmax = max(xmax, xmin + 1) + ymax = max(ymax, ymin + 1) + + w, h = xmax - xmin, ymax - ymin cropped_polygons = [] for poly in self.polygons: p = poly.clone() - p[0::2] = p[0::2] - box[0] # .clamp(min=0, max=w) - p[1::2] = p[1::2] - box[1] # .clamp(min=0, max=h) + p[0::2] = p[0::2] - xmin # .clamp(min=0, max=w) + p[1::2] = p[1::2] - ymin # .clamp(min=0, max=h) cropped_polygons.append(p) - return Polygons(cropped_polygons, size=(w, h), mode=self.mode) + return PolygonInstance(cropped_polygons, size=(w, h)) + + def resize(self, size): + try: + iter(size) + except TypeError: + assert isinstance(size, (int, float)) + size = size, size - def resize(self, size, *args, **kwargs): ratios = tuple(float(s) / float(s_orig) for s, s_orig in zip(size, self.size)) + if ratios[0] == ratios[1]: ratio = ratios[0] scaled_polys = [p * ratio for p in self.polygons] - return Polygons(scaled_polys, size, mode=self.mode) + return PolygonInstance(scaled_polys, size) ratio_w, ratio_h = ratios scaled_polygons = [] @@ -120,47 +293,82 @@ def resize(self, size, *args, **kwargs): p[1::2] *= ratio_h scaled_polygons.append(p) - return Polygons(scaled_polygons, size=size, mode=self.mode) + return PolygonInstance(scaled_polygons, size=size) - def convert(self, mode): + def convert_to_binarymask(self): width, height = self.size - if mode == "mask": - rles = mask_utils.frPyObjects( - [p.numpy() for p in self.polygons], height, width - ) - rle = mask_utils.merge(rles) - mask = mask_utils.decode(rle) - mask = torch.from_numpy(mask) - # TODO add squeeze? 
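The reworked `PolygonInstance` above keeps the same flip/crop/resize behaviour as the old `Polygons` class, but it clamps crop boxes and drops degenerate polygons with fewer than three points. A short usage sketch; the coordinates are invented, and `convert_to_binarymask` is defined just below.

```python
from maskrcnn_benchmark.structures.segmentation_mask import PolygonInstance

# one object described by a single polygon: x0, y0, x1, y1, ...
poly = [[100.0, 100.0, 200.0, 100.0, 200.0, 200.0, 100.0, 200.0]]
inst = PolygonInstance(poly, size=(640, 480))   # size is (width, height)

cropped = inst.crop([50, 50, 250, 250])         # xyxy box; coordinates are shifted by (xmin, ymin)
resized = inst.resize((320, 240))               # scales x and y by the width/height ratios
mask = inst.convert_to_binarymask()             # [480, 640] uint8 tensor built via the COCO RLE utils
print(mask.shape, mask.sum().item())
```

Call sites that used the old `Polygons.convert("mask")` path, such as the mask-head loss earlier in this patch, now go through `SegmentationMask.get_mask_tensor()` instead.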
- return mask + # formatting for COCO PythonAPI + polygons = [p.numpy() for p in self.polygons] + rles = mask_utils.frPyObjects(polygons, height, width) + rle = mask_utils.merge(rles) + mask = mask_utils.decode(rle) + mask = torch.from_numpy(mask) + return mask + + def __len__(self): + return len(self.polygons) def __repr__(self): s = self.__class__.__name__ + "(" - s += "num_polygons={}, ".format(len(self.polygons)) + s += "num_groups={}, ".format(len(self.polygons)) s += "image_width={}, ".format(self.size[0]) - s += "image_height={}, ".format(self.size[1]) - s += "mode={})".format(self.mode) + s += "image_height={})".format(self.size[1]) return s -class SegmentationMask(object): +class PolygonList(object): """ - This class stores the segmentations for all objects in the image + This class handles PolygonInstances for all objects in the image """ - def __init__(self, polygons, size, mode=None): + def __init__(self, polygons, size): """ Arguments: - polygons: a list of list of lists of numbers. The first + polygons: + a list of list of lists of numbers. The first level of the list correspond to individual instances, the second level to all the polygons that compose the object, and the third level to the polygon coordinates. + + OR + + a list of PolygonInstances. + + OR + + a PolygonList + + size: absolute image size + """ - assert isinstance(polygons, list) + if isinstance(polygons, (list, tuple)): + if len(polygons) == 0: + polygons = [[[]]] + if isinstance(polygons[0], (list, tuple)): + assert isinstance(polygons[0][0], (list, tuple)), str( + type(polygons[0][0]) + ) + else: + assert isinstance(polygons[0], PolygonInstance), str(type(polygons[0])) + + elif isinstance(polygons, PolygonList): + size = polygons.size + polygons = polygons.polygons - self.polygons = [Polygons(p, size, mode) for p in polygons] - self.size = size - self.mode = mode + else: + RuntimeError( + "Type of argument `polygons` is not allowed:%s" % (type(polygons)) + ) + + assert isinstance(size, (list, tuple)), str(type(size)) + + self.polygons = [] + for p in polygons: + p = PolygonInstance(p, size) + if len(p) > 0: + self.polygons.append(p) + + self.size = tuple(size) def transpose(self, method): if method not in (FLIP_LEFT_RIGHT, FLIP_TOP_BOTTOM): @@ -168,30 +376,49 @@ def transpose(self, method): "Only FLIP_LEFT_RIGHT and FLIP_TOP_BOTTOM implemented" ) - flipped = [] + flipped_polygons = [] for polygon in self.polygons: - flipped.append(polygon.transpose(method)) - return SegmentationMask(flipped, size=self.size, mode=self.mode) + flipped_polygons.append(polygon.transpose(method)) + + return PolygonList(flipped_polygons, size=self.size) def crop(self, box): w, h = box[2] - box[0], box[3] - box[1] - cropped = [] + cropped_polygons = [] for polygon in self.polygons: - cropped.append(polygon.crop(box)) - return SegmentationMask(cropped, size=(w, h), mode=self.mode) + cropped_polygons.append(polygon.crop(box)) - def resize(self, size, *args, **kwargs): - scaled = [] + cropped_size = w, h + return PolygonList(cropped_polygons, cropped_size) + + def resize(self, size): + resized_polygons = [] for polygon in self.polygons: - scaled.append(polygon.resize(size, *args, **kwargs)) - return SegmentationMask(scaled, size=size, mode=self.mode) + resized_polygons.append(polygon.resize(size)) + + resized_size = size + return PolygonList(resized_polygons, resized_size) def to(self, *args, **kwargs): return self + def convert_to_binarymask(self): + if len(self) > 0: + masks = torch.stack([p.convert_to_binarymask() for p in 
self.polygons]) + else: + size = self.size + masks = torch.empty([0, size[1], size[0]], dtype=torch.uint8) + + return BinaryMaskList(masks, size=self.size) + + def __len__(self): + return len(self.polygons) + def __getitem__(self, item): - if isinstance(item, (int, slice)): + if isinstance(item, int): selected_polygons = [self.polygons[item]] + elif isinstance(item, slice): + selected_polygons = self.polygons[item] else: # advanced indexing on a single dimension selected_polygons = [] @@ -201,7 +428,7 @@ def __getitem__(self, item): item = item.tolist() for i in item: selected_polygons.append(self.polygons[i]) - return SegmentationMask(selected_polygons, size=self.size, mode=self.mode) + return PolygonList(selected_polygons, size=self.size) def __iter__(self): return iter(self.polygons) @@ -212,3 +439,105 @@ def __repr__(self): s += "image_width={}, ".format(self.size[0]) s += "image_height={})".format(self.size[1]) return s + + +class SegmentationMask(object): + + """ + This class stores the segmentations for all objects in the image. + It wraps BinaryMaskList and PolygonList conveniently. + """ + + def __init__(self, instances, size, mode="poly"): + """ + Arguments: + instances: two types + (1) polygon + (2) binary mask + size: (width, height) + mode: 'poly', 'mask'. if mode is 'mask', convert mask of any format to binary mask + """ + + assert isinstance(size, (list, tuple)) + assert len(size) == 2 + if isinstance(size[0], torch.Tensor): + assert isinstance(size[1], torch.Tensor) + size = size[0].item(), size[1].item() + + assert isinstance(size[0], (int, float)) + assert isinstance(size[1], (int, float)) + + if mode == "poly": + self.instances = PolygonList(instances, size) + elif mode == "mask": + self.instances = BinaryMaskList(instances, size) + else: + raise NotImplementedError("Unknown mode: %s" % str(mode)) + + self.mode = mode + self.size = tuple(size) + + def transpose(self, method): + flipped_instances = self.instances.transpose(method) + return SegmentationMask(flipped_instances, self.size, self.mode) + + def crop(self, box): + cropped_instances = self.instances.crop(box) + cropped_size = cropped_instances.size + return SegmentationMask(cropped_instances, cropped_size, self.mode) + + def resize(self, size, *args, **kwargs): + resized_instances = self.instances.resize(size) + resized_size = size + return SegmentationMask(resized_instances, resized_size, self.mode) + + def to(self, *args, **kwargs): + return self + + def convert(self, mode): + if mode == self.mode: + return self + + if mode == "poly": + converted_instances = self.instances.convert_to_polygon() + elif mode == "mask": + converted_instances = self.instances.convert_to_binarymask() + else: + raise NotImplementedError("Unknown mode: %s" % str(mode)) + + return SegmentationMask(converted_instances, self.size, mode) + + def get_mask_tensor(self): + instances = self.instances + if self.mode == "poly": + instances = instances.convert_to_binarymask() + # If there is only 1 instance + return instances.masks.squeeze(0) + + def __len__(self): + return len(self.instances) + + def __getitem__(self, item): + selected_instances = self.instances.__getitem__(item) + return SegmentationMask(selected_instances, self.size, self.mode) + + def __iter__(self): + self.iter_idx = 0 + return self + + def __next__(self): + if self.iter_idx < self.__len__(): + next_segmentation = self.__getitem__(self.iter_idx) + self.iter_idx += 1 + return next_segmentation + raise StopIteration() + + next = __next__ # Python 2 compatibility + + def 
__repr__(self): + s = self.__class__.__name__ + "(" + s += "num_instances={}, ".format(len(self.instances)) + s += "image_width={}, ".format(self.size[0]) + s += "image_height={}, ".format(self.size[1]) + s += "mode={})".format(self.mode) + return s diff --git a/maskrcnn_benchmark/utils/c2_model_loading.py b/maskrcnn_benchmark/utils/c2_model_loading.py index b1b9996e8..cbf4c050a 100644 --- a/maskrcnn_benchmark/utils/c2_model_loading.py +++ b/maskrcnn_benchmark/utils/c2_model_loading.py @@ -143,6 +143,33 @@ def _load_c2_pickled_weights(file_path): return weights +def _rename_conv_weights_for_deformable_conv_layers(state_dict, cfg): + import re + logger = logging.getLogger(__name__) + logger.info("Remapping conv weights for deformable conv weights") + layer_keys = sorted(state_dict.keys()) + for ix, stage_with_dcn in enumerate(cfg.MODEL.RESNETS.STAGE_WITH_DCN, 1): + if not stage_with_dcn: + continue + for old_key in layer_keys: + pattern = ".*layer{}.*conv2.*".format(ix) + r = re.match(pattern, old_key) + if r is None: + continue + for param in ["weight", "bias"]: + if old_key.find(param) == -1: + continue + new_key = old_key.replace( + "conv2.{}".format(param), "conv2.conv.{}".format(param) + ) + logger.info("pattern: {}, old_key: {}, new_key: {}".format( + pattern, old_key, new_key + )) + state_dict[new_key] = state_dict[old_key] + del state_dict[old_key] + return state_dict + + _C2_STAGE_NAMES = { "R-50": ["1.2", "2.3", "3.5", "4.2"], "R-101": ["1.2", "2.3", "3.22", "4.2"], @@ -168,6 +195,10 @@ def load_resnet_c2_format(cfg, f): arch = arch.replace("-RETINANET", "") stages = _C2_STAGE_NAMES[arch] state_dict = _rename_weights_for_resnet(state_dict, stages) + # *********************************** + # for deformable convolutional layer + state_dict = _rename_conv_weights_for_deformable_conv_layers(state_dict, cfg) + # *********************************** return dict(model=state_dict) diff --git a/maskrcnn_benchmark/utils/comm.py b/maskrcnn_benchmark/utils/comm.py index 46d7c55ce..669f208ad 100644 --- a/maskrcnn_benchmark/utils/comm.py +++ b/maskrcnn_benchmark/utils/comm.py @@ -63,8 +63,8 @@ def all_gather(data): tensor = torch.ByteTensor(storage).to("cuda") # obtain Tensor size of each rank - local_size = torch.IntTensor([tensor.numel()]).to("cuda") - size_list = [torch.IntTensor([0]).to("cuda") for _ in range(world_size)] + local_size = torch.LongTensor([tensor.numel()]).to("cuda") + size_list = [torch.LongTensor([0]).to("cuda") for _ in range(world_size)] dist.all_gather(size_list, local_size) size_list = [int(size.item()) for size in size_list] max_size = max(size_list) diff --git a/maskrcnn_benchmark/utils/model_zoo.py b/maskrcnn_benchmark/utils/model_zoo.py index 7a0ebb349..2128ad7bb 100644 --- a/maskrcnn_benchmark/utils/model_zoo.py +++ b/maskrcnn_benchmark/utils/model_zoo.py @@ -2,9 +2,14 @@ import os import sys -from torch.utils.model_zoo import _download_url_to_file -from torch.utils.model_zoo import urlparse -from torch.utils.model_zoo import HASH_REGEX +try: + from torch.hub import _download_url_to_file + from torch.hub import urlparse + from torch.hub import HASH_REGEX +except ImportError: + from torch.utils.model_zoo import _download_url_to_file + from torch.utils.model_zoo import urlparse + from torch.utils.model_zoo import HASH_REGEX from maskrcnn_benchmark.utils.comm import is_main_process from maskrcnn_benchmark.utils.comm import synchronize @@ -30,8 +35,8 @@ def cache_url(url, model_dir=None, progress=True): >>> cached_file =
maskrcnn_benchmark.utils.model_zoo.cache_url('https://s3.amazonaws.com/pytorch/models/resnet18-5c106cde.pth') """ if model_dir is None: - torch_home = os.path.expanduser(os.getenv('TORCH_HOME', '~/.torch')) - model_dir = os.getenv('TORCH_MODEL_ZOO', os.path.join(torch_home, 'models')) + torch_home = os.path.expanduser(os.getenv("TORCH_HOME", "~/.torch")) + model_dir = os.getenv("TORCH_MODEL_ZOO", os.path.join(torch_home, "models")) if not os.path.exists(model_dir): os.makedirs(model_dir) parts = urlparse(url) diff --git a/setup.py b/setup.py index bfb6845e5..837c2cd15 100644 --- a/setup.py +++ b/setup.py @@ -28,7 +28,7 @@ def get_extensions(): extra_compile_args = {"cxx": []} define_macros = [] - if torch.cuda.is_available() and CUDA_HOME is not None: + if (torch.cuda.is_available() and CUDA_HOME is not None) or os.getenv("FORCE_CUDA", "0") == "1": extension = CUDAExtension sources += source_cuda define_macros += [("WITH_CUDA", None)] diff --git a/tests/test_segmentation_mask.py b/tests/test_segmentation_mask.py new file mode 100644 index 000000000..d01ed9452 --- /dev/null +++ b/tests/test_segmentation_mask.py @@ -0,0 +1,74 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +import unittest +import torch +from maskrcnn_benchmark.structures.segmentation_mask import SegmentationMask + + +class TestSegmentationMask(unittest.TestCase): + def __init__(self, method_name='runTest'): + super(TestSegmentationMask, self).__init__(method_name) + poly = [[[423.0, 306.5, 406.5, 277.0, 400.0, 271.5, 389.5, 277.0, + 387.5, 292.0, 384.5, 295.0, 374.5, 220.0, 378.5, 210.0, + 391.0, 200.5, 404.0, 199.5, 414.0, 203.5, 425.5, 221.0, + 438.5, 297.0, 423.0, 306.5], + [100, 100, 200, 100, 200, 200, 100, 200], + ]] + width = 640 + height = 480 + size = width, height + + self.P = SegmentationMask(poly, size, 'poly') + self.M = SegmentationMask(poly, size, 'poly').convert('mask') + + + def L1(self, A, B): + diff = A.get_mask_tensor() - B.get_mask_tensor() + diff = torch.sum(torch.abs(diff.float())).item() + return diff + + + def test_convert(self): + M_hat = self.M.convert('poly').convert('mask') + P_hat = self.P.convert('mask').convert('poly') + + diff_mask = self.L1(self.M, M_hat) + diff_poly = self.L1(self.P, P_hat) + self.assertTrue(diff_mask == diff_poly) + self.assertTrue(diff_mask <= 8169.) + self.assertTrue(diff_poly <= 8169.) + + + def test_crop(self): + box = [400, 250, 500, 300] # xyxy + diff = self.L1(self.M.crop(box), self.P.crop(box)) + self.assertTrue(diff <= 1.) + + + def test_resize(self): + new_size = 50, 25 + M_hat = self.M.resize(new_size) + P_hat = self.P.resize(new_size) + diff = self.L1(M_hat, P_hat) + + self.assertTrue(self.M.size == self.P.size) + self.assertTrue(M_hat.size == P_hat.size) + self.assertTrue(self.M.size != M_hat.size) + self.assertTrue(diff <= 255.) + + + def test_transpose(self): + FLIP_LEFT_RIGHT = 0 + FLIP_TOP_BOTTOM = 1 + diff_hor = self.L1(self.M.transpose(FLIP_LEFT_RIGHT), + self.P.transpose(FLIP_LEFT_RIGHT)) + + diff_ver = self.L1(self.M.transpose(FLIP_TOP_BOTTOM), + self.P.transpose(FLIP_TOP_BOTTOM)) + + self.assertTrue(diff_hor <= 53250.) + self.assertTrue(diff_ver <= 42494.) 
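The new `tests/test_segmentation_mask.py` checks that the polygon and binary-mask backends stay within fixed L1 tolerances of each other under convert, crop, resize and transpose. A minimal way to run just this module from the repository root, using only the standard library, is sketched below.

```python
import unittest

# Discover and run only the segmentation-mask tests added by this patch.
suite = unittest.defaultTestLoader.discover("tests", pattern="test_segmentation_mask.py")
unittest.TextTestRunner(verbosity=2).run(suite)
```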
+ + +if __name__ == "__main__": + + unittest.main() diff --git a/tools/test_net.py b/tools/test_net.py index d0acd2833..c666a4655 100644 --- a/tools/test_net.py +++ b/tools/test_net.py @@ -17,6 +17,12 @@ from maskrcnn_benchmark.utils.logger import setup_logger from maskrcnn_benchmark.utils.miscellaneous import mkdir +# Check if we can enable mixed-precision via apex.amp +try: + from apex import amp +except ImportError: + raise ImportError('Use APEX for mixed precision via apex.amp') + def main(): parser = argparse.ArgumentParser(description="PyTorch Object Detection Inference") @@ -61,6 +67,10 @@ def main(): model = build_detection_model(cfg) model.to(cfg.MODEL.DEVICE) + # Initialize mixed-precision if necessary + use_mixed_precision = cfg.DTYPE == 'float16' + amp_handle = amp.init(enabled=use_mixed_precision, verbose=cfg.AMP_VERBOSE) + output_dir = cfg.OUTPUT_DIR checkpointer = DetectronCheckpointer(cfg, model, save_dir=output_dir) _ = checkpointer.load(cfg.MODEL.WEIGHT) diff --git a/tools/train_net.py b/tools/train_net.py index e4f95f015..9f4761b3f 100644 --- a/tools/train_net.py +++ b/tools/train_net.py @@ -25,6 +25,13 @@ from maskrcnn_benchmark.utils.logger import setup_logger from maskrcnn_benchmark.utils.miscellaneous import mkdir +# See if we can use apex.DistributedDataParallel instead of the torch default, +# and enable mixed-precision via apex.amp +try: + from apex import amp +except ImportError: + raise ImportError('Use APEX for multi-precision via apex.amp') + def train(cfg, local_rank, distributed): model = build_detection_model(cfg) @@ -34,6 +41,11 @@ def train(cfg, local_rank, distributed): optimizer = make_optimizer(cfg, model) scheduler = make_lr_scheduler(cfg, optimizer) + # Initialize mixed-precision training + use_mixed_precision = cfg.DTYPE == "float16" + amp_opt_level = 'O1' if use_mixed_precision else 'O0' + model, optimizer = amp.initialize(model, optimizer, opt_level=amp_opt_level) + if distributed: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[local_rank], output_device=local_rank,
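Both tools now import `apex.amp` and fail fast when apex is missing; `tools/train_net.py` picks opt level `'O1'` when `cfg.DTYPE == "float16"` and falls back to the no-op `'O0'` otherwise. The matching changes to the training loop are not part of this excerpt, so the sketch below only shows the standard apex pattern for the backward pass; the loss-dict interface and the loop variables are placeholders.

```python
from apex import amp

def train_one_epoch(model, optimizer, data_loader):
    """Placeholder loop; only the amp.scale_loss usage is the point here."""
    for images, targets in data_loader:
        loss_dict = model(images, targets)        # detectors return a dict of losses in train mode
        losses = sum(loss for loss in loss_dict.values())
        optimizer.zero_grad()
        with amp.scale_loss(losses, optimizer) as scaled_loss:
            scaled_loss.backward()                # gradients are scaled to avoid fp16 underflow
        optimizer.step()
```

Note that `tools/test_net.py` uses the older `amp.init(...)` entry point instead, which wraps inference without needing an optimizer.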