Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Object Detection with Gaudi2 #2608

Merged
merged 16 commits into from
Nov 10, 2023
2 changes: 1 addition & 1 deletion src/otx/algorithms/common/adapters/mmcv/configurer.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,7 +176,7 @@ def configure_device(self, cfg):
elif "gpu_ids" not in cfg:
cfg.gpu_ids = range(1)

# consider "cuda", "hpu" and "cpu" device only
# consider "cuda", "xpu", "hpu" and "cpu" device only
if is_hpu_available():
cfg.device = "hpu"
elif torch.cuda.is_available():
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
from torch.nn import LayerNorm

from otx.algorithms.classification import MMCLS_AVAILABLE
from otx.algorithms.common.utils.utils import cast_bf16_to_fp32

if MMCLS_AVAILABLE:
from mmcls.models.necks.gap import GlobalAveragePooling
Expand Down Expand Up @@ -74,7 +75,7 @@ def _recording_forward(
): # pylint: disable=unused-argument
tensors = self.func(output)
if isinstance(tensors, torch.Tensor):
tensors_np = tensors.detach().cpu().numpy()
tensors_np = cast_bf16_to_fp32(tensors).detach().cpu().numpy()
elif isinstance(tensors, np.ndarray):
tensors_np = tensors
else:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ def build_data_parallel(
model = model.xpu()
model = XPUDataParallel(model, device_ids=config.gpu_ids)
elif is_hpu_available() and config.get("gpu_ids", []):
model = model.hpu()
model = model.to("hpu")
eunwoosh marked this conversation as resolved.
Show resolved Hide resolved
model = HPUDataParallel(model, device_ids=config.gpu_ids)
elif torch.cuda.is_available() and config.get("gpu_ids", []):
if distributed:
Expand Down Expand Up @@ -140,9 +140,10 @@ def val_step(self, *inputs, **kwargs):


class HPUDataParallel(MMDataParallel):
def __init__(self, *args, enable_autocast: bool = False, **kwargs):
def __init__(self, *args, enable_autocast: bool = False, put_gt_on_device=True, **kwargs):
super().__init__(*args, **kwargs)
self.enable_autocast = enable_autocast
self.put_gt_on_device = put_gt_on_device
self.src_device_obj = torch.device("hpu", self.device_ids[0])

def scatter(self, inputs, kwargs, device_ids):
Expand All @@ -153,6 +154,8 @@ def scatter(self, inputs, kwargs, device_ids):
for val in x:
if isinstance(val, dict):
for k in val:
if not self.put_gt_on_device and k.startswith("gt_"):
kprokofi marked this conversation as resolved.
Show resolved Hide resolved
continue
kprokofi marked this conversation as resolved.
Show resolved Hide resolved
if isinstance(val[k], torch.Tensor):
val[k] = val[k].to(self.src_device_obj)
elif isinstance(val[k], list):
Expand Down
39 changes: 38 additions & 1 deletion src/otx/algorithms/detection/adapters/mmdet/apis/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,12 @@
from torchvision.ops import nms as tv_nms
from torchvision.ops import roi_align as tv_roi_align

from otx.algorithms.common.adapters.mmcv.utils import XPUDataParallel
from otx.algorithms.common.adapters.mmcv.utils import HPUDataParallel, XPUDataParallel
from otx.algorithms.common.adapters.mmcv.utils.hpu_optimizers import HABANA_OPTIMIZERS

ext_module = ext_loader.load_ext("_ext", ["nms", "softnms", "nms_match", "nms_rotated", "nms_quadri"])
dp_factory["xpu"] = XPUDataParallel
dp_factory["hpu"] = HPUDataParallel


def auto_scale_lr(cfg, distributed, logger):
Expand Down Expand Up @@ -119,11 +121,26 @@ def train_detector(model, dataset, cfg, distributed=False, validate=False, times
elif cfg.device == "xpu":
model = build_dp(model, cfg.device, device_ids=cfg.gpu_ids, enable_autocast=bool(fp16_cfg))
model.to(f"xpu:{cfg.gpu_ids[0]}")
elif cfg.device == "hpu":
import habana_frameworks.torch.core as htcore
from habana_frameworks.torch.utils.library_loader import load_habana_module
kprokofi marked this conversation as resolved.
Show resolved Hide resolved

load_habana_module()
os.environ["PT_HPU_LAZY_MODE"] = "1"
kprokofi marked this conversation as resolved.
Show resolved Hide resolved
kprokofi marked this conversation as resolved.
Show resolved Hide resolved
assert len(cfg.gpu_ids) == 1
model = build_dp(
model, cfg.device, device_ids=cfg.gpu_ids, dim=0, enable_autocast=bool(fp16_cfg), put_gt_on_device=False
)
model.to(model.src_device_obj)
kprokofi marked this conversation as resolved.
Show resolved Hide resolved
htcore.mark_step()
kprokofi marked this conversation as resolved.
Show resolved Hide resolved
model.zero_grad()
kprokofi marked this conversation as resolved.
Show resolved Hide resolved
else:
model = build_dp(model, cfg.device, device_ids=cfg.gpu_ids)

# build optimizer
auto_scale_lr(cfg, distributed, logger)
if cfg.device == "hpu":
cfg.optimizer = patch_optimizer(cfg.optimizer)
optimizer = build_optimizer(model, cfg.optimizer)

if cfg.device == "xpu":
Expand All @@ -137,6 +154,14 @@ def train_detector(model, dataset, cfg, distributed=False, validate=False, times
model.train()
model, optimizer = torch.xpu.optimize(model, optimizer=optimizer, dtype=dtype)

if cfg.device == "hpu":
NMSop.forward = monkey_patched_xpu_nms
RoIAlign.forward = monkey_patched_xpu_roi_align
kprokofi marked this conversation as resolved.
Show resolved Hide resolved
# build runner
if cfg.device == "hpu":
if (new_type := "Fused" + cfg.optimizer.get("type", "SGD")) in HABANA_OPTIMIZERS:
cfg.optimizer["type"] = new_type
kprokofi marked this conversation as resolved.
Show resolved Hide resolved

runner = build_runner(
cfg.runner, default_args=dict(model=model, optimizer=optimizer, work_dir=cfg.work_dir, logger=logger, meta=meta)
)
Expand Down Expand Up @@ -198,6 +223,18 @@ def train_detector(model, dataset, cfg, distributed=False, validate=False, times
runner.run(data_loaders, cfg.workflow)


def patch_optimizer(cfg_optim):
    """Force the optimizer config to plain SGD for OD and IS training on HPU.

    Only SGD is currently supported for object detection (OD) and instance
    segmentation (IS) on HPU, so any other optimizer type is replaced by SGD.
    The config is modified in place and the same object is returned.

    Args:
        cfg_optim (dict): Optimizer config. Its ``"type"`` entry names the
            optimizer; a missing entry is treated as "not SGD yet" and is
            set to ``"SGD"``.

    Returns:
        dict: The same ``cfg_optim`` object with ``"type"`` equal to ``"SGD"``
        and Adam-family-only hyperparameters removed.
    """
    if cfg_optim.get("type") == "SGD":
        return cfg_optim

    # Only SGD is supported for OD and IS on HPU for now.
    cfg_optim["type"] = "SGD"
    # Hyperparameters that exist only for Adam-family optimizers must not be
    # forwarded to SGD, whose constructor does not accept them.
    for adam_only_key in ("betas", "eps", "amsgrad"):
        cfg_optim.pop(adam_only_key, None)
    return cfg_optim


def monkey_patched_xpu_nms(ctx, bboxes, scores, iou_threshold, offset, score_threshold, max_num):
kprokofi marked this conversation as resolved.
Show resolved Hide resolved
"""Runs MMCVs NMS with torchvision.nms, or forces NMS from MMCV to run on CPU."""
is_filtering_by_score = score_threshold > 0
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,13 +15,15 @@
# and limitations under the License.

import multiprocessing as mp
import time
from typing import Dict, List, Tuple, Union

import mmcv
import numpy as np
import pycocotools.mask as mask_util
from mmcv.utils import print_log
from mmdet.core import BitmapMasks, PolygonMasks, eval_map
from mmdet.core.evaluation import mean_ap
from mmdet.core.evaluation.bbox_overlaps import bbox_overlaps
from mmdet.core.evaluation.class_names import get_classes
from mmdet.core.evaluation.mean_ap import average_precision
Expand Down Expand Up @@ -59,6 +61,7 @@ def print_map_summary( # pylint: disable=too-many-locals,too-many-branches
if scale_ranges is not None:
assert len(scale_ranges) == num_scales

segmentation = "miou" in results
kprokofi marked this conversation as resolved.
Show resolved Hide resolved
num_classes = len(results)

recalls = np.zeros((num_scales, num_classes), dtype=np.float32)
Expand All @@ -69,7 +72,8 @@ def print_map_summary( # pylint: disable=too-many-locals,too-many-branches
if cls_result["recall"].size > 0:
recalls[:, i] = np.array(cls_result["recall"], ndmin=2)[:, -1]
aps[:, i] = cls_result["ap"]
mious[:, i] = cls_result["miou"]
if segmentation:
mious[:, i] = cls_result["miou"]
num_gts[:, i] = cls_result["num_gts"]

if dataset is None:
Expand All @@ -82,7 +86,9 @@ def print_map_summary( # pylint: disable=too-many-locals,too-many-branches
if not isinstance(mean_ap, list):
mean_ap = [mean_ap]

header = ["class", "gts", "dets", "recall", "ap", "miou"]
header = ["class", "gts", "dets", "recall", "ap"]
if segmentation:
header.append("miou")
for i in range(num_scales):
if scale_ranges is not None:
print_log(f"Scale range {scale_ranges[i]}", logger=logger)
Expand All @@ -94,12 +100,19 @@ def print_map_summary( # pylint: disable=too-many-locals,too-many-branches
results[j]["num_dets"],
f"{recalls[i, j]:.3f}",
f"{aps[i, j]:.3f}",
f"{mious[i, j]:.3f}",
]
if segmentation:
row_data.append(f"{mious[i, j]:.3f}")
table_data.append(row_data)
table_data.append(["mAP", "", "", "", f"{mean_ap[i]:.3f}", f"{np.mean(mious[i]):.3f}"])
table_ = (
["mAP", "", "", "", f"{mean_ap[i]:.3f}", f"{np.mean(mious[i]):.3f}"]
if segmentation
else ["mAP", "", "", "", f"{mean_ap[i]:.3f}"]
)
table_data.append(table_)
table = AsciiTable(table_data)
table.inner_footing_row_border = True
time.sleep(0.1) # prevent segmentation fault
kprokofi marked this conversation as resolved.
Show resolved Hide resolved
print_log("\n" + table.table, logger=logger)


Expand Down Expand Up @@ -245,6 +258,7 @@ def __init__(self, annotation: List[Dict], domain: Domain, classes: List[str], n
else:
self.annotation = annotation
self.nproc = nproc
mean_ap.print_map_summary = print_map_summary

def get_gt_instance_masks(self, annotation: List[Dict]):
"""Format ground truth instance mask annotation.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,6 @@ class CustomSingleStageDetector(SAMDetectorMixin, DetLossDynamicsTrackingMixin,

def __init__(self, *args, task_adapt=None, **kwargs):
super().__init__(*args, **kwargs)

# Hook for class-sensitive weight loading
if task_adapt:
self._register_load_state_dict_pre_hook(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from .custom_fcn_mask_head import CustomFCNMaskHead
from .custom_retina_head import CustomRetinaHead
from .custom_roi_head import CustomRoIHead
from .custom_rpn_head import CustomRPNHead
from .custom_ssd_head import CustomSSDHead
from .custom_vfnet_head import CustomVFNetHead
from .custom_yolox_head import CustomYOLOXHead
Expand All @@ -27,6 +28,7 @@
"CustomVFNetHead",
"CustomYOLOXHead",
"DETRHeadExtension",
"CustomRPNHead",
# Loss dynamics tracking
"CustomATSSHeadTrackingLossDynamics",
]
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,32 @@ def __init__(self, *args, bg_loss_weight=-1.0, use_qfl=False, qfl_cfg=None, **kw
self.bg_loss_weight = bg_loss_weight
self.use_qfl = use_qfl

def forward_single(self, x, scale):
    """Run the parent single-level forward and hand HPU outputs over to the CPU.

    Args:
        x (Tensor): Features of a single scale level.
        scale (:obj: `mmcv.cnn.Scale`): Learnable scale module to resize
            the bbox prediction.

    Returns:
        tuple:
            cls_score (Tensor): Cls scores for a single scale level
                the channels number is num_anchors * num_classes.
            bbox_pred (Tensor): Box energies / deltas for a single scale
                level, the channels number is num_anchors * 4.
            centerness (Tensor): Centerness for a single scale level, the
                channel number is (N, num_anchors * 1, H, W).
    """
    cls_score, bbox_pred, centerness = super().forward_single(x, scale)
    if cls_score.device.type != "hpu":
        return cls_score, bbox_pred, centerness

    # Outputs produced on HPU are moved to CPU so that the further
    # post-processing runs there.
    return cls_score.cpu(), bbox_pred.cpu(), centerness.cpu()

@force_fp32(apply_to=("cls_scores", "bbox_preds", "centernesses"))
def loss(self, cls_scores, bbox_preds, centernesses, gt_bboxes, gt_labels, img_metas, gt_bboxes_ignore=None):
"""Compute losses of the head.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,14 @@ def _bbox_forward_train(self, x, sampling_results, gt_bboxes, gt_labels, img_met
bbox_results.update(loss_bbox=loss_bbox)
return bbox_results

def _mask_forward(self, x, rois=None, pos_inds=None, bbox_feats=None):
    """Mask head forward used in both training and testing.

    Delegates to the parent implementation; when the predicted masks live
    on an HPU device, both ``mask_pred`` and ``mask_feats`` are copied to CPU.
    """
    mask_results = super()._mask_forward(x, rois, pos_inds, bbox_feats)
    produced_on_hpu = mask_results["mask_pred"].device.type == "hpu"
    if produced_on_hpu:
        for key in ("mask_pred", "mask_feats"):
            mask_results[key] = mask_results[key].cpu()
    return mask_results


@HEADS.register_module()
class CustomConvFCBBoxHead(Shared2FCBBoxHead, CrossDatasetDetectorHead):
Expand Down Expand Up @@ -125,6 +133,16 @@ def get_targets(self, sampling_results, gt_bboxes, gt_labels, img_metas, rcnn_tr
valid_label_mask = torch.cat(valid_label_mask, 0)
return labels, label_weights, bbox_targets, bbox_weights, valid_label_mask

def forward(self, x):
    """ConvFCBBoxHead forward that returns CPU tensors when run on HPU.

    The parent forward produces the classification scores and box
    predictions; if the scores are on an HPU device both outputs are moved
    to CPU before being returned.
    """
    cls_score, bbox_pred = super().forward(x)
    if cls_score.device.type != "hpu":
        return cls_score, bbox_pred
    return cls_score.cpu(), bbox_pred.cpu()

@force_fp32(apply_to=("cls_score", "bbox_pred"))
def loss(
self,
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
"""Custom RPN head for OTX template."""
# Copyright (C) 2022 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
#
from mmdet.models.builder import HEADS
from mmdet.models.dense_heads import RPNHead


@HEADS.register_module()
class CustomRPNHead(RPNHead):
    """RPN head whose per-level outputs are moved to CPU when computed on HPU.

    No constructor is defined here, so all arguments are handled by ``RPNHead``.

    Args:
        in_channels (int): Number of channels in the input feature map.
        init_cfg (dict or list[dict], optional): Initialization config dict.
        num_convs (int): Number of convolution layers in the head. Default 1.
    """

    def forward_single(self, x):
        """Forward feature map of a single scale level."""
        rpn_cls_score, rpn_bbox_pred = super().forward_single(x)
        if rpn_cls_score.device.type != "hpu":
            return rpn_cls_score, rpn_bbox_pred
        # Outputs computed on HPU are returned as CPU tensors.
        return rpn_cls_score.cpu(), rpn_bbox_pred.cpu()
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,35 @@ def _init_layers(self):
nn.Conv2d(in_channel, num_base_priors * self.cls_out_channels, kernel_size=3, padding=1)
)

def forward(self, feats):
    """Apply the per-level classification and regression convs to the features.

    Each feature map is paired with its own regression and classification
    layer. Outputs that end up on an HPU device are moved to CPU before
    being collected; outputs on any other device are collected unchanged.

    Args:
        feats (tuple[Tensor]): Features from the upstream network, each is
            a 4D-tensor.

    Returns:
        tuple:
            cls_scores (list[Tensor]): Classification scores for all scale
                levels, each is a 4D-tensor, the channels number is
                num_anchors * num_classes.
            bbox_preds (list[Tensor]): Box energies / deltas for all scale
                levels, each is a 4D-tensor, the channels number is
                num_anchors * 4.
    """
    cls_scores, bbox_preds = [], []
    for level_feat, reg_layer, cls_layer in zip(feats, self.reg_convs, self.cls_convs):
        level_cls = cls_layer(level_feat)
        level_reg = reg_layer(level_feat)
        # The device of the classification output decides for both tensors.
        on_hpu = level_cls.device.type == "hpu"
        cls_scores.append(level_cls.cpu() if on_hpu else level_cls)
        bbox_preds.append(level_reg.cpu() if on_hpu else level_reg)
    return cls_scores, bbox_preds

def loss_single(
self,
cls_score,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,16 @@ def loss(self, cls_scores, bbox_preds, objectnesses, gt_bboxes, gt_labels, img_m

return loss_dict

def forward_single(self, x, cls_convs, reg_convs, conv_cls, conv_reg, conv_obj):
    """Forward feature of a single scale level.

    Delegates to the parent implementation; when the classification score
    is on an HPU device, all three outputs are moved to CPU for further
    post-processing.
    """
    outputs = super().forward_single(x, cls_convs, reg_convs, conv_cls, conv_reg, conv_obj)
    cls_score, bbox_pred, objectness = outputs
    if cls_score.device.type != "hpu":
        return cls_score, bbox_pred, objectness
    return cls_score.cpu(), bbox_pred.cpu(), objectness.cpu()


@HEADS.register_module()
class CustomYOLOXHeadTrackingLossDynamics(TrackingLossDynamicsMixIn, CustomYOLOXHead):
Expand Down Expand Up @@ -245,6 +255,7 @@ def _get_target_single(self, cls_preds, objectness, priors, decoded_bboxes, gt_b
num_priors = priors.size(0)
num_gts = gt_labels.size(0)
gt_bboxes = gt_bboxes.to(decoded_bboxes.dtype)

# No target
if num_gts == 0:
cls_target = cls_preds.new_zeros((0, self.num_classes))
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
type="CustomMaskRCNN", # Use CustomMaskRCNN for Incremental Learning
neck=dict(type="FPN", in_channels=[24, 48, 120, 352], out_channels=80, num_outs=5),
rpn_head=dict(
type="RPNHead",
type="CustomRPNHead",
in_channels=80,
feat_channels=80,
anchor_generator=dict(type="AnchorGenerator", scales=[8], ratios=[0.5, 1.0, 2.0], strides=[4, 8, 16, 32, 64]),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@
),
neck=dict(type="FPN", in_channels=[96, 192, 384, 768], out_channels=256, num_outs=5),
rpn_head=dict(
type="RPNHead",
type="CustomRPNHead",
in_channels=256,
feat_channels=256,
anchor_generator=dict(type="AnchorGenerator", scales=[8], ratios=[0.5, 1.0, 2.0], strides=[4, 8, 16, 32, 64]),
Expand Down
Loading