diff --git a/src/otx/algorithms/common/adapters/mmcv/configurer.py b/src/otx/algorithms/common/adapters/mmcv/configurer.py index 54ee9326b80..83d43192ed7 100644 --- a/src/otx/algorithms/common/adapters/mmcv/configurer.py +++ b/src/otx/algorithms/common/adapters/mmcv/configurer.py @@ -176,7 +176,7 @@ def configure_device(self, cfg): elif "gpu_ids" not in cfg: cfg.gpu_ids = range(1) - # consider "cuda", "hpu" and "cpu" device only + # consider "cuda", "xpu", "hpu" and "cpu" device only if is_hpu_available(): cfg.device = "hpu" elif torch.cuda.is_available(): diff --git a/src/otx/algorithms/common/adapters/mmcv/hooks/recording_forward_hook.py b/src/otx/algorithms/common/adapters/mmcv/hooks/recording_forward_hook.py index 062cc230367..a3b2698babb 100644 --- a/src/otx/algorithms/common/adapters/mmcv/hooks/recording_forward_hook.py +++ b/src/otx/algorithms/common/adapters/mmcv/hooks/recording_forward_hook.py @@ -23,6 +23,7 @@ from torch.nn import LayerNorm from otx.algorithms.classification import MMCLS_AVAILABLE +from otx.algorithms.common.utils.utils import cast_bf16_to_fp32 if MMCLS_AVAILABLE: from mmcls.models.necks.gap import GlobalAveragePooling @@ -74,7 +75,7 @@ def _recording_forward( ): # pylint: disable=unused-argument tensors = self.func(output) if isinstance(tensors, torch.Tensor): - tensors_np = tensors.detach().cpu().numpy() + tensors_np = cast_bf16_to_fp32(tensors).detach().cpu().numpy() elif isinstance(tensors, np.ndarray): tensors_np = tensors else: diff --git a/src/otx/algorithms/common/adapters/mmcv/utils/_builder_build_data_parallel.py b/src/otx/algorithms/common/adapters/mmcv/utils/_builder_build_data_parallel.py index 39c9bf5f7b3..226e5e8cc25 100644 --- a/src/otx/algorithms/common/adapters/mmcv/utils/_builder_build_data_parallel.py +++ b/src/otx/algorithms/common/adapters/mmcv/utils/_builder_build_data_parallel.py @@ -64,7 +64,7 @@ def build_data_parallel( model = model.xpu() model = XPUDataParallel(model, device_ids=config.gpu_ids) elif 
is_hpu_available() and config.get("gpu_ids", []): - model = model.hpu() + model = model.to("hpu") model = HPUDataParallel(model, device_ids=config.gpu_ids) elif torch.cuda.is_available() and config.get("gpu_ids", []): if distributed: @@ -140,9 +140,10 @@ def val_step(self, *inputs, **kwargs): class HPUDataParallel(MMDataParallel): - def __init__(self, *args, enable_autocast: bool = False, **kwargs): + def __init__(self, *args, enable_autocast: bool = False, put_gt_on_device=True, **kwargs): super().__init__(*args, **kwargs) self.enable_autocast = enable_autocast + self.put_gt_on_device = put_gt_on_device self.src_device_obj = torch.device("hpu", self.device_ids[0]) def scatter(self, inputs, kwargs, device_ids): @@ -153,6 +154,10 @@ def scatter(self, inputs, kwargs, device_ids): for val in x: if isinstance(val, dict): for k in val: + # don't put annotations on the HPU to proceed + # post-processing on the CPU + if not self.put_gt_on_device and k.startswith("gt_"): + continue if isinstance(val[k], torch.Tensor): val[k] = val[k].to(self.src_device_obj) elif isinstance(val[k], list): diff --git a/src/otx/algorithms/common/utils/__init__.py b/src/otx/algorithms/common/utils/__init__.py index 6395bd6e60d..80372c59b4b 100644 --- a/src/otx/algorithms/common/utils/__init__.py +++ b/src/otx/algorithms/common/utils/__init__.py @@ -14,6 +14,8 @@ # See the License for the specific language governing permissions # and limitations under the License. 
+import os + from .callback import ( InferenceProgressCallback, OptimizationProgressCallback, @@ -59,4 +61,5 @@ if is_hpu_available(): + os.environ["PT_HPU_LAZY_MODE"] = "1" import habana_frameworks.torch.gpu_migration # noqa: F401 diff --git a/src/otx/algorithms/detection/adapters/mmdet/apis/train.py b/src/otx/algorithms/detection/adapters/mmdet/apis/train.py index caf8720b59a..4565631880d 100644 --- a/src/otx/algorithms/detection/adapters/mmdet/apis/train.py +++ b/src/otx/algorithms/detection/adapters/mmdet/apis/train.py @@ -24,10 +24,12 @@ from torchvision.ops import nms as tv_nms from torchvision.ops import roi_align as tv_roi_align -from otx.algorithms.common.adapters.mmcv.utils import XPUDataParallel +from otx.algorithms.common.adapters.mmcv.utils import HPUDataParallel, XPUDataParallel +from otx.algorithms.common.adapters.mmcv.utils.hpu_optimizers import HABANA_OPTIMIZERS ext_module = ext_loader.load_ext("_ext", ["nms", "softnms", "nms_match", "nms_rotated", "nms_quadri"]) dp_factory["xpu"] = XPUDataParallel +dp_factory["hpu"] = HPUDataParallel def auto_scale_lr(cfg, distributed, logger): @@ -119,17 +121,27 @@ def train_detector(model, dataset, cfg, distributed=False, validate=False, times elif cfg.device == "xpu": model = build_dp(model, cfg.device, device_ids=cfg.gpu_ids, enable_autocast=bool(fp16_cfg)) model.to(f"xpu:{cfg.gpu_ids[0]}") + elif cfg.device == "hpu": + model = build_dp( + model, cfg.device, device_ids=cfg.gpu_ids, dim=0, enable_autocast=bool(fp16_cfg), put_gt_on_device=False + ) + # patch optimizer + if (new_type := "Fused" + cfg.optimizer.get("type", "SGD")) in HABANA_OPTIMIZERS: + cfg.optimizer["type"] = new_type else: model = build_dp(model, cfg.device, device_ids=cfg.gpu_ids) # build optimizer auto_scale_lr(cfg, distributed, logger) + + if cfg.device in ["hpu", "xpu"]: + # dynamic patch for nms and roi_align + NMSop.forward = monkey_patched_nms + RoIAlign.forward = monkey_patched_roi_align + optimizer = build_optimizer(model, 
cfg.optimizer) if cfg.device == "xpu": - # dynamic patch for nms and roi_align - NMSop.forward = monkey_patched_xpu_nms - RoIAlign.forward = monkey_patched_xpu_roi_align if fp16_cfg is not None: dtype = torch.bfloat16 else: @@ -198,7 +210,7 @@ def train_detector(model, dataset, cfg, distributed=False, validate=False, times runner.run(data_loaders, cfg.workflow) -def monkey_patched_xpu_nms(ctx, bboxes, scores, iou_threshold, offset, score_threshold, max_num): +def monkey_patched_nms(ctx, bboxes, scores, iou_threshold, offset, score_threshold, max_num): """Runs MMCVs NMS with torchvision.nms, or forces NMS from MMCV to run on CPU.""" is_filtering_by_score = score_threshold > 0 if is_filtering_by_score: @@ -228,7 +240,7 @@ def monkey_patched_xpu_nms(ctx, bboxes, scores, iou_threshold, offset, score_thr return inds -def monkey_patched_xpu_roi_align(self, input, rois): +def monkey_patched_roi_align(self, input, rois): """Replaces MMCVs roi align with the one from torchvision. Args: diff --git a/src/otx/algorithms/detection/adapters/mmdet/evaluation/evaluator.py b/src/otx/algorithms/detection/adapters/mmdet/evaluation/evaluator.py index b6e5e6ab2dd..36bda12206f 100644 --- a/src/otx/algorithms/detection/adapters/mmdet/evaluation/evaluator.py +++ b/src/otx/algorithms/detection/adapters/mmdet/evaluation/evaluator.py @@ -15,6 +15,7 @@ # and limitations under the License. 
import multiprocessing as mp
+import time
 from typing import Dict, List, Tuple, Union
 
 import mmcv
@@ -22,11 +23,13 @@
 import pycocotools.mask as mask_util
 from mmcv.utils import print_log
 from mmdet.core import BitmapMasks, PolygonMasks, eval_map
+from mmdet.core.evaluation import mean_ap
 from mmdet.core.evaluation.bbox_overlaps import bbox_overlaps
 from mmdet.core.evaluation.class_names import get_classes
 from mmdet.core.evaluation.mean_ap import average_precision
 from terminaltables import AsciiTable
 
+from otx.algorithms.common.utils.utils import is_hpu_available
 from otx.api.entities.label import Domain
 from otx.api.utils.time_utils import timeit
 
@@ -59,6 +62,7 @@ def print_map_summary(  # pylint: disable=too-many-locals,too-many-branches
     if scale_ranges is not None:
         assert len(scale_ranges) == num_scales
 
+    segmentation = len(results) > 0 and all("miou" in r for r in results)  # results: list of per-class dicts
     num_classes = len(results)
 
     recalls = np.zeros((num_scales, num_classes), dtype=np.float32)
@@ -69,7 +73,8 @@ def print_map_summary(  # pylint: disable=too-many-locals,too-many-branches
         if cls_result["recall"].size > 0:
             recalls[:, i] = np.array(cls_result["recall"], ndmin=2)[:, -1]
         aps[:, i] = cls_result["ap"]
-        mious[:, i] = cls_result["miou"]
+        if segmentation:
+            mious[:, i] = cls_result["miou"]
         num_gts[:, i] = cls_result["num_gts"]
 
     if dataset is None:
@@ -82,7 +87,9 @@ def print_map_summary(  # pylint: disable=too-many-locals,too-many-branches
     if not isinstance(mean_ap, list):
         mean_ap = [mean_ap]
 
-    header = ["class", "gts", "dets", "recall", "ap", "miou"]
+    header = ["class", "gts", "dets", "recall", "ap"]
+    if segmentation:
+        header.append("miou")
     for i in range(num_scales):
         if scale_ranges is not None:
             print_log(f"Scale range {scale_ranges[i]}", logger=logger)
@@ -94,12 +101,20 @@ def print_map_summary(  # pylint: disable=too-many-locals,too-many-branches
                 results[j]["num_dets"],
                 f"{recalls[i, j]:.3f}",
                 f"{aps[i, j]:.3f}",
-                f"{mious[i, j]:.3f}",
             ]
+            if segmentation:
+                row_data.append(f"{mious[i, j]:.3f}")
table_data.append(row_data) - table_data.append(["mAP", "", "", "", f"{mean_ap[i]:.3f}", f"{np.mean(mious[i]):.3f}"]) + table_ = ( + ["mAP", "", "", "", f"{mean_ap[i]:.3f}", f"{np.mean(mious[i]):.3f}"] + if segmentation + else ["mAP", "", "", "", f"{mean_ap[i]:.3f}"] + ) + table_data.append(table_) table = AsciiTable(table_data) table.inner_footing_row_border = True + if is_hpu_available(): + time.sleep(0.1) # prevent segmentation fault print_log("\n" + table.table, logger=logger) @@ -245,6 +260,7 @@ def __init__(self, annotation: List[Dict], domain: Domain, classes: List[str], n else: self.annotation = annotation self.nproc = nproc + mean_ap.print_map_summary = print_map_summary def get_gt_instance_masks(self, annotation: List[Dict]): """Format ground truth instance mask annotation. diff --git a/src/otx/algorithms/detection/adapters/mmdet/models/detectors/custom_single_stage_detector.py b/src/otx/algorithms/detection/adapters/mmdet/models/detectors/custom_single_stage_detector.py index f690c38d86b..a8e926cae5d 100644 --- a/src/otx/algorithms/detection/adapters/mmdet/models/detectors/custom_single_stage_detector.py +++ b/src/otx/algorithms/detection/adapters/mmdet/models/detectors/custom_single_stage_detector.py @@ -38,7 +38,6 @@ class CustomSingleStageDetector(SAMDetectorMixin, DetLossDynamicsTrackingMixin, def __init__(self, *args, task_adapt=None, **kwargs): super().__init__(*args, **kwargs) - # Hook for class-sensitive weight loading if task_adapt: self._register_load_state_dict_pre_hook( diff --git a/src/otx/algorithms/detection/adapters/mmdet/models/heads/__init__.py b/src/otx/algorithms/detection/adapters/mmdet/models/heads/__init__.py index 28da39d0a1b..e705d18bdc8 100644 --- a/src/otx/algorithms/detection/adapters/mmdet/models/heads/__init__.py +++ b/src/otx/algorithms/detection/adapters/mmdet/models/heads/__init__.py @@ -10,6 +10,7 @@ from .custom_fcn_mask_head import CustomFCNMaskHead from .custom_retina_head import CustomRetinaHead from .custom_roi_head 
import CustomRoIHead
+from .custom_rpn_head import CustomRPNHead
 from .custom_ssd_head import CustomSSDHead
 from .custom_vfnet_head import CustomVFNetHead
 from .custom_yolox_head import CustomYOLOXHead
@@ -27,6 +28,7 @@
     "CustomVFNetHead",
     "CustomYOLOXHead",
     "DETRHeadExtension",
+    "CustomRPNHead",
     # Loss dynamics tracking
     "CustomATSSHeadTrackingLossDynamics",
 ]
diff --git a/src/otx/algorithms/detection/adapters/mmdet/models/heads/custom_atss_head.py b/src/otx/algorithms/detection/adapters/mmdet/models/heads/custom_atss_head.py
index 477790e0d4d..41b7fd3aa8b 100644
--- a/src/otx/algorithms/detection/adapters/mmdet/models/heads/custom_atss_head.py
+++ b/src/otx/algorithms/detection/adapters/mmdet/models/heads/custom_atss_head.py
@@ -49,6 +49,26 @@ def __init__(self, *args, bg_loss_weight=-1.0, use_qfl=False, qfl_cfg=None, **kw
         self.bg_loss_weight = bg_loss_weight
         self.use_qfl = use_qfl
 
+    def forward_single(self, x, scale):
+        """Run the parent single-level forward pass and keep its outputs off the HPU.
+
+        Args:
+            x (Tensor): Feature map of one scale level.
+            scale (:obj:`mmcv.cnn.Scale`): Learnable scale module, passed through
+                to the parent implementation.
+
+        Returns:
+            tuple[Tensor, Tensor, Tensor]: ``(cls_score, bbox_pred, centerness)`` as
+                returned by the parent ``forward_single``; each tensor is copied
+                to the CPU when ``cls_score`` lives on an "hpu" device.
+        """
+        cls_score, bbox_pred, centerness = super().forward_single(x, scale)
+        if cls_score.device.type != "hpu":
+            return cls_score, bbox_pred, centerness
+
+        # the remaining post-processing is meant to run on the CPU
+        return cls_score.cpu(), bbox_pred.cpu(), centerness.cpu()
+
     @force_fp32(apply_to=("cls_scores", "bbox_preds", "centernesses"))
     def loss(self, cls_scores, bbox_preds, centernesses, gt_bboxes, gt_labels, img_metas, gt_bboxes_ignore=None):
         """Compute losses of the head.
diff --git a/src/otx/algorithms/detection/adapters/mmdet/models/heads/custom_roi_head.py b/src/otx/algorithms/detection/adapters/mmdet/models/heads/custom_roi_head.py
index 05902fc9e70..d8e546a5f91 100644
--- a/src/otx/algorithms/detection/adapters/mmdet/models/heads/custom_roi_head.py
+++ b/src/otx/algorithms/detection/adapters/mmdet/models/heads/custom_roi_head.py
@@ -54,6 +54,15 @@ def _bbox_forward_train(self, x, sampling_results, gt_bboxes, gt_labels, img_met
         bbox_results.update(loss_bbox=loss_bbox)
         return bbox_results
 
+    def _mask_forward(self, x, rois=None, pos_inds=None, bbox_feats=None):
+        """Run the parent mask forward pass and move its HPU outputs to the CPU."""
+        mask_results = super()._mask_forward(x, rois, pos_inds, bbox_feats)
+        if mask_results["mask_pred"].device.type != "hpu":
+            return mask_results
+        for key in ("mask_pred", "mask_feats"):
+            mask_results[key] = mask_results[key].cpu()
+        return mask_results
+
 
 @HEADS.register_module()
 class CustomConvFCBBoxHead(Shared2FCBBoxHead, CrossDatasetDetectorHead):
@@ -125,6 +134,13 @@ def get_targets(self, sampling_results, gt_bboxes, gt_labels, img_metas, rcnn_tr
         valid_label_mask = torch.cat(valid_label_mask, 0)
         return labels, label_weights, bbox_targets, bbox_weights, valid_label_mask
 
+    def forward(self, x):
+        """Run the parent bbox head and return its outputs on the CPU when they are on an HPU."""
+        cls_score, bbox_pred = super().forward(x)
+        if cls_score.device.type != "hpu":
+            return cls_score, bbox_pred
+        return cls_score.cpu(), bbox_pred.cpu()
+
     @force_fp32(apply_to=("cls_score", "bbox_pred"))
     def loss(
         self,
diff --git a/src/otx/algorithms/detection/adapters/mmdet/models/heads/custom_rpn_head.py b/src/otx/algorithms/detection/adapters/mmdet/models/heads/custom_rpn_head.py
new file mode 100644
index 00000000000..b5bb4184fe3
--- /dev/null
+++ b/src/otx/algorithms/detection/adapters/mmdet/models/heads/custom_rpn_head.py
@@ -0,0 +1,25 @@
+"""Custom RPN head for OTX template."""
+# Copyright (C) 2022 Intel Corporation
+#
SPDX-License-Identifier: Apache-2.0
+#
+from mmdet.models.builder import HEADS
+from mmdet.models.dense_heads import RPNHead
+
+
+@HEADS.register_module()
+class CustomRPNHead(RPNHead):
+    """RPN head that returns its per-level outputs on the CPU when they are computed on an HPU.
+
+    Args:
+        in_channels (int): Number of channels in the input feature map.
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+        num_convs (int): Number of convolution layers in the head. Default 1.
+    """
+
+    def forward_single(self, x):
+        """Run the parent single-level forward pass and keep its outputs off the HPU."""
+        rpn_cls_score, rpn_bbox_pred = super().forward_single(x)
+        if rpn_cls_score.device.type != "hpu":
+            return rpn_cls_score, rpn_bbox_pred
+        # hand CPU copies to the downstream processing
+        return rpn_cls_score.cpu(), rpn_bbox_pred.cpu()
diff --git a/src/otx/algorithms/detection/adapters/mmdet/models/heads/custom_ssd_head.py b/src/otx/algorithms/detection/adapters/mmdet/models/heads/custom_ssd_head.py
index 6d5f1ce8427..7aebbcb3173 100644
--- a/src/otx/algorithms/detection/adapters/mmdet/models/heads/custom_ssd_head.py
+++ b/src/otx/algorithms/detection/adapters/mmdet/models/heads/custom_ssd_head.py
@@ -81,6 +81,26 @@ def _init_layers(self):
                     nn.Conv2d(in_channel, num_base_priors * self.cls_out_channels, kernel_size=3, padding=1)
                 )
 
+    def forward(self, feats):
+        """Apply the per-level classification and regression convs to the input features.
+
+        Args:
+            feats (tuple[Tensor]): Feature maps from the upstream network, one per scale level.
+
+        Returns:
+            tuple[list[Tensor], list[Tensor]]: ``(cls_scores, bbox_preds)`` with one entry per
+                level; every output is copied to the CPU when it was produced on an "hpu" device.
+        """
+        cls_scores = []
+        bbox_preds = []
+        for feat, reg_conv, cls_conv in zip(feats, self.reg_convs, self.cls_convs):
+            cls_out = cls_conv(feat)
+            reg_out = reg_conv(feat)
+            on_hpu = cls_out.device.type == "hpu"
+            cls_scores.append(cls_out.cpu() if on_hpu else cls_out)
+            bbox_preds.append(reg_out.cpu() if on_hpu else reg_out)
+        return cls_scores, bbox_preds
+
     def loss_single(
         self,
         cls_score,
diff --git a/src/otx/algorithms/detection/adapters/mmdet/models/heads/custom_yolox_head.py b/src/otx/algorithms/detection/adapters/mmdet/models/heads/custom_yolox_head.py
index 5de9fc272ff..161d692e4f4 100644
--- a/src/otx/algorithms/detection/adapters/mmdet/models/heads/custom_yolox_head.py
+++ b/src/otx/algorithms/detection/adapters/mmdet/models/heads/custom_yolox_head.py
@@ -103,6 +103,14 @@ def loss(self, cls_scores, bbox_preds, objectnesses, gt_bboxes, gt_labels, img_m
 
         return loss_dict
 
+    def forward_single(self, x, cls_convs, reg_convs, conv_cls, conv_reg, conv_obj):
+        """Run the parent single-level forward pass and return its outputs off the HPU."""
+        cls_score, bbox_pred, objectness = super().forward_single(x, cls_convs, reg_convs, conv_cls, conv_reg, conv_obj)
+        if cls_score.device.type != "hpu":
+            return cls_score, bbox_pred, objectness
+        # the remaining post-processing is meant to run on the CPU
+        return cls_score.cpu(), bbox_pred.cpu(), objectness.cpu()
+
 
 @HEADS.register_module()
 class CustomYOLOXHeadTrackingLossDynamics(TrackingLossDynamicsMixIn, CustomYOLOXHead):
@@ -245,6 +253,7 @@ def _get_target_single(self, cls_preds, objectness, priors, decoded_bboxes, gt_b
         num_priors = priors.size(0)
         num_gts = gt_labels.size(0)
         gt_bboxes = gt_bboxes.to(decoded_bboxes.dtype)
+
         # No target
         if num_gts == 0:
             cls_target = cls_preds.new_zeros((0, self.num_classes))
diff --git a/src/otx/algorithms/detection/adapters/mmdet/task.py b/src/otx/algorithms/detection/adapters/mmdet/task.py
index 3b8040408be..53c00ceaf32 100644
--- a/src/otx/algorithms/detection/adapters/mmdet/task.py
+++
b/src/otx/algorithms/detection/adapters/mmdet/task.py @@ -44,8 +44,8 @@ from otx.algorithms.common.utils.data import get_dataset from otx.algorithms.common.utils.logger import get_logger from otx.algorithms.detection.adapters.mmdet.apis.train import ( - monkey_patched_xpu_nms, - monkey_patched_xpu_roi_align, + monkey_patched_nms, + monkey_patched_roi_align, train_detector, ) from otx.algorithms.detection.adapters.mmdet.configurer import ( @@ -348,9 +348,9 @@ def _infer_model( else: target_classes = mm_dataset.CLASSES - if cfg.device == "xpu": - NMSop.forward = monkey_patched_xpu_nms - RoIAlign.forward = monkey_patched_xpu_roi_align + if cfg.device in ["xpu", "hpu"]: + NMSop.forward = monkey_patched_nms + RoIAlign.forward = monkey_patched_roi_align # Model model = self.build_model(cfg, fp16=cfg.get("fp16", False)) diff --git a/src/otx/algorithms/detection/configs/instance_segmentation/efficientnetb2b_maskrcnn/model.py b/src/otx/algorithms/detection/configs/instance_segmentation/efficientnetb2b_maskrcnn/model.py index 72ca9481ef3..03cb21733dc 100644 --- a/src/otx/algorithms/detection/configs/instance_segmentation/efficientnetb2b_maskrcnn/model.py +++ b/src/otx/algorithms/detection/configs/instance_segmentation/efficientnetb2b_maskrcnn/model.py @@ -28,7 +28,7 @@ type="CustomMaskRCNN", # Use CustomMaskRCNN for Incremental Learning neck=dict(type="FPN", in_channels=[24, 48, 120, 352], out_channels=80, num_outs=5), rpn_head=dict( - type="RPNHead", + type="CustomRPNHead", in_channels=80, feat_channels=80, anchor_generator=dict(type="AnchorGenerator", scales=[8], ratios=[0.5, 1.0, 2.0], strides=[4, 8, 16, 32, 64]), diff --git a/src/otx/algorithms/detection/configs/instance_segmentation/maskrcnn_swin_t/model.py b/src/otx/algorithms/detection/configs/instance_segmentation/maskrcnn_swin_t/model.py index 66f7522bdee..203470d2fac 100644 --- a/src/otx/algorithms/detection/configs/instance_segmentation/maskrcnn_swin_t/model.py +++ 
b/src/otx/algorithms/detection/configs/instance_segmentation/maskrcnn_swin_t/model.py @@ -38,7 +38,7 @@ ), neck=dict(type="FPN", in_channels=[96, 192, 384, 768], out_channels=256, num_outs=5), rpn_head=dict( - type="RPNHead", + type="CustomRPNHead", in_channels=256, feat_channels=256, anchor_generator=dict(type="AnchorGenerator", scales=[8], ratios=[0.5, 1.0, 2.0], strides=[4, 8, 16, 32, 64]), diff --git a/src/otx/algorithms/detection/configs/instance_segmentation/resnet50_maskrcnn/model.py b/src/otx/algorithms/detection/configs/instance_segmentation/resnet50_maskrcnn/model.py index 6832028e425..d8918edc33f 100644 --- a/src/otx/algorithms/detection/configs/instance_segmentation/resnet50_maskrcnn/model.py +++ b/src/otx/algorithms/detection/configs/instance_segmentation/resnet50_maskrcnn/model.py @@ -33,7 +33,7 @@ num_outs=5, ), rpn_head=dict( - type="RPNHead", + type="CustomRPNHead", in_channels=256, feat_channels=256, anchor_generator=dict(