diff --git a/config/aligndet_r101v1_fpn_fp16_1x.py b/config/aligndet_r101v1_fpn_fp16_1x.py new file mode 100644 index 0000000..816d94d --- /dev/null +++ b/config/aligndet_r101v1_fpn_fp16_1x.py @@ -0,0 +1,292 @@ +from symbol.builder import FasterRcnn as Detector +from models.retinanet.builder import MSRAResNet101V1FPN as Backbone +from models.retinanet.builder import RetinaNetNeck as Neck +from models.aligndet.builder import AlignRetinaNetHead as RpnHead +from models.aligndet.builder import AlignRoiExtractor as RoiExtractor +from models.aligndet.builder import AlignHead as BboxHead +from mxnext.complicate import normalizer_factory + + +def get_config(is_train): + class General: + log_frequency = 10 + name = __name__.rsplit("/")[-1].rsplit(".")[-1] + batch_image = 2 if is_train else 1 + fp16 = True + + + class KvstoreParam: + kvstore = "local" + batch_image = General.batch_image + gpus = [0, 1, 2, 3, 4, 5, 6, 7] + fp16 = General.fp16 + + + class NormalizeParam: + normalizer = normalizer_factory(type="fixbn") + + + class BackboneParam: + fp16 = General.fp16 + normalizer = NormalizeParam.normalizer + + + class NeckParam: + fp16 = General.fp16 + normalizer = NormalizeParam.normalizer + + + class RpnParam: + fp16 = General.fp16 + normalizer = NormalizeParam.normalizer + batch_image = General.batch_image + num_class = 1 + 80 + loss_weight = 1.0 + + class anchor_generate: + scale = (4 * 2 ** (1.0 / 3.0),) + ratio = (1.0,) + stride = (8, 16, 32, 64, 128) + short = (100, 50, 25, 13, 7) + long = (167, 84, 42, 21, 11) + image_anchor = None + + class head: + conv_channel = 256 + mean = (0.0, 0.0, 0.0, 0.0) + std = (1.0, 1.0, 1.0, 1.0) + + class focal_loss: + alpha = 0.25 + gamma = 2.0 + + class proposal: + pre_nms_top_n = 1000 + min_bbox_side = 0 + min_det_score = 0.05 + + class subsample_proposal: + pass + + class bbox_target: + class_agnostic = False + mean = (0.0, 0.0, 0.0, 0.0) + std = (1.0, 1.0, 1.0, 1.0) + allowed_border = 9999 + pos_thr = 0.7 + neg_thr = 0.7 + min_pos_thr = 0.0 + + + class BboxParam: + fp16 = General.fp16 + normalizer = NormalizeParam.normalizer + num_class = 1 + 80 + loss_weight = 1.0 + + class anchor_generate: + scale = (4 * 2 ** (1.0 / 3.0),) + ratio = (1.0,) + stride = (8, 16, 32, 64, 128) + + class head: + merge_score = False + num_conv = 2 + use_1x1 = True + conv_channel = 1024 + mean = (0.0, 0.0, 0.0, 0.0) + std = (1.0, 1.0, 1.0, 1.0) + + class focal_loss: + alpha = 0.25 + gamma = 2.0 + + class proposal: + pre_nms_top_n = 1000 + min_bbox_side = 0 + min_det_score = 0.05 + + + class RoiParam: + fp16 = General.fp16 + normalizer = NormalizeParam.normalizer + sample_bins = 7 + im2col = True + stride = (8, 16, 32, 64, 128) + conv_channel = 256 * 7 * 7 + scale = (4 * 2 ** (1.0 / 3.0),) + ratio = (1.0,) + + + class DatasetParam: + if is_train: + image_set = ("coco_train2014", "coco_valminusminival2014") + else: + image_set = ("coco_minival2014", ) + + backbone = Backbone(BackboneParam) + neck = Neck(NeckParam) + rpn_head = RpnHead(RpnParam) + roi_extractor = RoiExtractor(RoiParam) + bbox_head = BboxHead(BboxParam) + detector = Detector() + if is_train: + train_sym = detector.get_train_symbol(backbone, neck, rpn_head, roi_extractor, bbox_head) + rpn_test_sym = None + test_sym = None + else: + train_sym = None + rpn_test_sym = None + test_sym = detector.get_test_symbol(backbone, neck, rpn_head, roi_extractor, bbox_head) + + + class ModelParam: + train_symbol = train_sym + test_symbol = test_sym + rpn_test_symbol = rpn_test_sym + + from_scratch = False + random = True + memonger = 
False + memonger_until = "stage3_unit21_plus" + + class pretrain: + prefix = "pretrain_model/resnet-v1-101" + epoch = 0 + fixed_param = ["conv0", "stage1", "gamma", "beta"] + + + class OptimizeParam: + class optimizer: + type = "sgd" + lr = 0.005 / 8 * len(KvstoreParam.gpus) * KvstoreParam.batch_image + momentum = 0.9 + wd = 0.0001 + clip_gradient = None + + class schedule: + begin_epoch = 0 + end_epoch = 6 + lr_iter = [60000 * 16 // (len(KvstoreParam.gpus) * KvstoreParam.batch_image), + 80000 * 16 // (len(KvstoreParam.gpus) * KvstoreParam.batch_image)] + + class warmup: + type = "gradual" + lr = 0.005 / 8 * len(KvstoreParam.gpus) * KvstoreParam.batch_image / 3 + iter = 500 + + + class TestParam: + min_det_score = 0 # filter appended boxes + max_det_per_image = 100 + + process_roidb = lambda x: x + process_output = lambda x, y: x + + class model: + prefix = "experiments/{}/checkpoint".format(General.name) + epoch = OptimizeParam.schedule.end_epoch + + class nms: + type = "nms" + thr = 0.5 + + class coco: + annotation = "data/coco/annotations/instances_minival2014.json" + + # data processing + class NormParam: + mean = (122.7717, 115.9465, 102.9801) # RGB order + std = (1.0, 1.0, 1.0) + + + class ResizeParam: + short = 800 + long = 1333 + + + class PadParam: + short = 800 + long = 1333 + max_num_gt = 100 + + + class AnchorTarget2DParam: + def __init__(self): + self.generate = self._generate() + self.mean = (0.0, 0.0, 0.0, 0.0) + self.std = (1.0, 1.0, 1.0, 1.0) + self.class_agnostic = False + + class _generate: + def __init__(self): + self.short = (100, 50, 25, 13, 7) + self.long = (167, 84, 42, 21, 11) + self.stride = (8, 16, 32, 64, 128) + + scales = (4 * 2 ** (1.0 / 3.0),) + aspects = (1.0,) + + class assign: + allowed_border = 9999 + pos_thr = 0.4 + neg_thr = 0.3 + min_pos_thr = 0.0 + + class sample: + image_anchor = None + pos_fraction = None + + + class RenameParam: + mapping = dict(image="data") + + + from core.detection_input import ReadRoiRecord, Resize2DImageBbox, \ + ConvertImageFromHwcToChw, Flip2DImageBbox, Pad2DImageBbox, \ + RenameRecord + from models.retinanet.input import PyramidAnchorTarget2D, Norm2DImage + + if is_train: + transform = [ + ReadRoiRecord(None), + Norm2DImage(NormParam), + Resize2DImageBbox(ResizeParam), + Flip2DImageBbox(), + Pad2DImageBbox(PadParam), + ConvertImageFromHwcToChw(), + PyramidAnchorTarget2D(AnchorTarget2DParam()), + RenameRecord(RenameParam.mapping) + ] + data_name = ["data", "im_info", "gt_bbox"] + label_name = ["rpn_cls_label", "rpn_reg_target", "rpn_reg_weight"] + else: + transform = [ + ReadRoiRecord(None), + Norm2DImage(NormParam), + Resize2DImageBbox(ResizeParam), + ConvertImageFromHwcToChw(), + RenameRecord(RenameParam.mapping) + ] + data_name = ["data", "im_info", "im_id", "rec_id"] + label_name = [] + + from models.retinanet import metric + + rpn_acc_metric = metric.FGAccMetric( + "FGAcc", + ["cls_loss_output"], + ["rpn_cls_label"] + ) + + bbox_acc_metric = metric.FGAccMetric( + "BboxFGAcc", + ["align_cls_loss_output", "align_label_blockgrad_output"], + [] + ) + + metric_list = [rpn_acc_metric, bbox_acc_metric] + + return General, KvstoreParam, RpnParam, RoiParam, BboxParam, DatasetParam, \ + ModelParam, OptimizeParam, TestParam, \ + transform, data_name, label_name, metric_list diff --git a/config/aligndet_r50v1_fpn_fp16_1x.py b/config/aligndet_r50v1_fpn_fp16_1x.py new file mode 100644 index 0000000..f85715e --- /dev/null +++ b/config/aligndet_r50v1_fpn_fp16_1x.py @@ -0,0 +1,292 @@ +from symbol.builder import FasterRcnn as Detector 
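+# NOTE: this config mirrors aligndet_r101v1_fpn_fp16_1x above; only the backbone (ResNet-50 vs. ResNet-101) and the matching pretrain prefix differ.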
+from models.retinanet.builder import MSRAResNet50V1FPN as Backbone +from models.retinanet.builder import RetinaNetNeck as Neck +from models.aligndet.builder import AlignRetinaNetHead as RpnHead +from models.aligndet.builder import AlignRoiExtractor as RoiExtractor +from models.aligndet.builder import AlignHead as BboxHead +from mxnext.complicate import normalizer_factory + + +def get_config(is_train): + class General: + log_frequency = 10 + name = __name__.rsplit("/")[-1].rsplit(".")[-1] + batch_image = 2 if is_train else 1 + fp16 = True + + + class KvstoreParam: + kvstore = "local" + batch_image = General.batch_image + gpus = [0, 1, 2, 3, 4, 5, 6, 7] + fp16 = General.fp16 + + + class NormalizeParam: + normalizer = normalizer_factory(type="fixbn") + + + class BackboneParam: + fp16 = General.fp16 + normalizer = NormalizeParam.normalizer + + + class NeckParam: + fp16 = General.fp16 + normalizer = NormalizeParam.normalizer + + + class RpnParam: + fp16 = General.fp16 + normalizer = NormalizeParam.normalizer + batch_image = General.batch_image + num_class = 1 + 80 + loss_weight = 1.0 + + class anchor_generate: + scale = (4 * 2 ** (1.0 / 3.0),) + ratio = (1.0,) + stride = (8, 16, 32, 64, 128) + short = (100, 50, 25, 13, 7) + long = (167, 84, 42, 21, 11) + image_anchor = None + + class head: + conv_channel = 256 + mean = (0.0, 0.0, 0.0, 0.0) + std = (1.0, 1.0, 1.0, 1.0) + + class focal_loss: + alpha = 0.25 + gamma = 2.0 + + class proposal: + pre_nms_top_n = 1000 + min_bbox_side = 0 + min_det_score = 0.05 + + class subsample_proposal: + pass + + class bbox_target: + class_agnostic = False + mean = (0.0, 0.0, 0.0, 0.0) + std = (1.0, 1.0, 1.0, 1.0) + allowed_border = 9999 + pos_thr = 0.7 + neg_thr = 0.7 + min_pos_thr = 0.0 + + + class BboxParam: + fp16 = General.fp16 + normalizer = NormalizeParam.normalizer + num_class = 1 + 80 + loss_weight = 1.0 + + class anchor_generate: + scale = (4 * 2 ** (1.0 / 3.0),) + ratio = (1.0,) + stride = (8, 16, 32, 64, 128) + + class head: + merge_score = False + num_conv = 2 + use_1x1 = True + conv_channel = 1024 + mean = (0.0, 0.0, 0.0, 0.0) + std = (1.0, 1.0, 1.0, 1.0) + + class focal_loss: + alpha = 0.25 + gamma = 2.0 + + class proposal: + pre_nms_top_n = 1000 + min_bbox_side = 0 + min_det_score = 0.05 + + + class RoiParam: + fp16 = General.fp16 + normalizer = NormalizeParam.normalizer + sample_bins = 7 + im2col = True + stride = (8, 16, 32, 64, 128) + conv_channel = 256 * 7 * 7 + scale = (4 * 2 ** (1.0 / 3.0),) + ratio = (1.0,) + + + class DatasetParam: + if is_train: + image_set = ("coco_train2014", "coco_valminusminival2014") + else: + image_set = ("coco_minival2014", ) + + backbone = Backbone(BackboneParam) + neck = Neck(NeckParam) + rpn_head = RpnHead(RpnParam) + roi_extractor = RoiExtractor(RoiParam) + bbox_head = BboxHead(BboxParam) + detector = Detector() + if is_train: + train_sym = detector.get_train_symbol(backbone, neck, rpn_head, roi_extractor, bbox_head) + rpn_test_sym = None + test_sym = None + else: + train_sym = None + rpn_test_sym = None + test_sym = detector.get_test_symbol(backbone, neck, rpn_head, roi_extractor, bbox_head) + + + class ModelParam: + train_symbol = train_sym + test_symbol = test_sym + rpn_test_symbol = rpn_test_sym + + from_scratch = False + random = True + memonger = False + memonger_until = "stage3_unit21_plus" + + class pretrain: + prefix = "pretrain_model/resnet-v1-50" + epoch = 0 + fixed_param = ["conv0", "stage1", "gamma", "beta"] + + + class OptimizeParam: + class optimizer: + type = "sgd" + lr = 0.005 / 8 * 
len(KvstoreParam.gpus) * KvstoreParam.batch_image + momentum = 0.9 + wd = 0.0001 + clip_gradient = None + + class schedule: + begin_epoch = 0 + end_epoch = 6 + lr_iter = [60000 * 16 // (len(KvstoreParam.gpus) * KvstoreParam.batch_image), + 80000 * 16 // (len(KvstoreParam.gpus) * KvstoreParam.batch_image)] + + class warmup: + type = "gradual" + lr = 0.005 / 8 * len(KvstoreParam.gpus) * KvstoreParam.batch_image / 3 + iter = 500 + + + class TestParam: + min_det_score = 0 # filter appended boxes + max_det_per_image = 100 + + process_roidb = lambda x: x + process_output = lambda x, y: x + + class model: + prefix = "experiments/{}/checkpoint".format(General.name) + epoch = OptimizeParam.schedule.end_epoch + + class nms: + type = "nms" + thr = 0.5 + + class coco: + annotation = "data/coco/annotations/instances_minival2014.json" + + # data processing + class NormParam: + mean = (122.7717, 115.9465, 102.9801) # RGB order + std = (1.0, 1.0, 1.0) + + + class ResizeParam: + short = 800 + long = 1333 + + + class PadParam: + short = 800 + long = 1333 + max_num_gt = 100 + + + class AnchorTarget2DParam: + def __init__(self): + self.generate = self._generate() + self.mean = (0.0, 0.0, 0.0, 0.0) + self.std = (1.0, 1.0, 1.0, 1.0) + self.class_agnostic = False + + class _generate: + def __init__(self): + self.short = (100, 50, 25, 13, 7) + self.long = (167, 84, 42, 21, 11) + self.stride = (8, 16, 32, 64, 128) + + scales = (4 * 2 ** (1.0 / 3.0),) + aspects = (1.0,) + + class assign: + allowed_border = 9999 + pos_thr = 0.4 + neg_thr = 0.3 + min_pos_thr = 0.0 + + class sample: + image_anchor = None + pos_fraction = None + + + class RenameParam: + mapping = dict(image="data") + + + from core.detection_input import ReadRoiRecord, Resize2DImageBbox, \ + ConvertImageFromHwcToChw, Flip2DImageBbox, Pad2DImageBbox, \ + RenameRecord + from models.retinanet.input import PyramidAnchorTarget2D, Norm2DImage + + if is_train: + transform = [ + ReadRoiRecord(None), + Norm2DImage(NormParam), + Resize2DImageBbox(ResizeParam), + Flip2DImageBbox(), + Pad2DImageBbox(PadParam), + ConvertImageFromHwcToChw(), + PyramidAnchorTarget2D(AnchorTarget2DParam()), + RenameRecord(RenameParam.mapping) + ] + data_name = ["data", "im_info", "gt_bbox"] + label_name = ["rpn_cls_label", "rpn_reg_target", "rpn_reg_weight"] + else: + transform = [ + ReadRoiRecord(None), + Norm2DImage(NormParam), + Resize2DImageBbox(ResizeParam), + ConvertImageFromHwcToChw(), + RenameRecord(RenameParam.mapping) + ] + data_name = ["data", "im_info", "im_id", "rec_id"] + label_name = [] + + from models.retinanet import metric + + rpn_acc_metric = metric.FGAccMetric( + "FGAcc", + ["cls_loss_output"], + ["rpn_cls_label"] + ) + + bbox_acc_metric = metric.FGAccMetric( + "BboxFGAcc", + ["align_cls_loss_output", "align_label_blockgrad_output"], + [] + ) + + metric_list = [rpn_acc_metric, bbox_acc_metric] + + return General, KvstoreParam, RpnParam, RoiParam, BboxParam, DatasetParam, \ + ModelParam, OptimizeParam, TestParam, \ + transform, data_name, label_name, metric_list diff --git a/config/aligndet_r50v1_fpn_ignorep3_fp16_1x.py b/config/aligndet_r50v1_fpn_ignorep3_fp16_1x.py new file mode 100644 index 0000000..6e39672 --- /dev/null +++ b/config/aligndet_r50v1_fpn_ignorep3_fp16_1x.py @@ -0,0 +1,294 @@ +from symbol.builder import FasterRcnn as Detector +from models.retinanet.builder import MSRAResNet50V1FPN as Backbone +from models.retinanet.builder import RetinaNetNeck as Neck +from models.aligndet.builder import AlignRetinaNetHead as RpnHead +from models.aligndet.builder 
import AlignRoiExtractor as RoiExtractor +from models.aligndet.builder import AlignHead as BboxHead +from mxnext.complicate import normalizer_factory + + +def get_config(is_train): + class General: + log_frequency = 10 + name = __name__.rsplit("/")[-1].rsplit(".")[-1] + batch_image = 2 if is_train else 1 + fp16 = True + + + class KvstoreParam: + kvstore = "local" + batch_image = General.batch_image + gpus = [0, 1, 2, 3, 4, 5, 6, 7] + fp16 = General.fp16 + + + class NormalizeParam: + normalizer = normalizer_factory(type="fixbn") + + + class BackboneParam: + fp16 = General.fp16 + normalizer = NormalizeParam.normalizer + + + class NeckParam: + fp16 = General.fp16 + normalizer = NormalizeParam.normalizer + + + class RpnParam: + fp16 = General.fp16 + normalizer = NormalizeParam.normalizer + batch_image = General.batch_image + num_class = 1 + 80 + loss_weight = 1.0 + + class anchor_generate: + scale = (4 * 2 ** (1.0 / 3.0),) + ratio = (1.0,) + stride = (8, 16, 32, 64, 128) + short = (100, 50, 25, 13, 7) + long = (167, 84, 42, 21, 11) + image_anchor = None + + class head: + conv_channel = 256 + mean = (0.0, 0.0, 0.0, 0.0) + std = (1.0, 1.0, 1.0, 1.0) + + class focal_loss: + alpha = 0.25 + gamma = 2.0 + + class proposal: + pre_nms_top_n = 1000 + min_bbox_side = 0 + min_det_score = 0.05 + + class subsample_proposal: + pass + + class bbox_target: + class_agnostic = False + mean = (0.0, 0.0, 0.0, 0.0) + std = (1.0, 1.0, 1.0, 1.0) + allowed_border = 9999 + pos_thr = 0.7 + neg_thr = 0.7 + min_pos_thr = 0.0 + + + class BboxParam: + fp16 = General.fp16 + normalizer = NormalizeParam.normalizer + num_class = 1 + 80 + loss_weight = 1.0 + + class anchor_generate: + scale = (4 * 2 ** (1.0 / 3.0),) + ratio = (1.0,) + stride = (8, 16, 32, 64, 128) + + class head: + merge_score = False + num_conv = 2 + use_1x1 = True + conv_channel = 1024 + mean = (0.0, 0.0, 0.0, 0.0) + std = (1.0, 1.0, 1.0, 1.0) + ignore_p3 = True + + class focal_loss: + alpha = 0.25 + gamma = 2.0 + + class proposal: + pre_nms_top_n = 1000 + min_bbox_side = 0 + min_det_score = 0.05 + + + class RoiParam: + fp16 = General.fp16 + normalizer = NormalizeParam.normalizer + sample_bins = 7 + im2col = True + stride = (8, 16, 32, 64, 128) + conv_channel = 256 * 7 * 7 + scale = (4 * 2 ** (1.0 / 3.0),) + ratio = (1.0,) + ignore_p3 = True + + + class DatasetParam: + if is_train: + image_set = ("coco_train2014", "coco_valminusminival2014") + else: + image_set = ("coco_minival2014", ) + + backbone = Backbone(BackboneParam) + neck = Neck(NeckParam) + rpn_head = RpnHead(RpnParam) + roi_extractor = RoiExtractor(RoiParam) + bbox_head = BboxHead(BboxParam) + detector = Detector() + if is_train: + train_sym = detector.get_train_symbol(backbone, neck, rpn_head, roi_extractor, bbox_head) + rpn_test_sym = None + test_sym = None + else: + train_sym = None + rpn_test_sym = None + test_sym = detector.get_test_symbol(backbone, neck, rpn_head, roi_extractor, bbox_head) + + + class ModelParam: + train_symbol = train_sym + test_symbol = test_sym + rpn_test_symbol = rpn_test_sym + + from_scratch = False + random = True + memonger = False + memonger_until = "stage3_unit21_plus" + + class pretrain: + prefix = "pretrain_model/resnet-v1-50" + epoch = 0 + fixed_param = ["conv0", "stage1", "gamma", "beta"] + + + class OptimizeParam: + class optimizer: + type = "sgd" + lr = 0.005 / 8 * len(KvstoreParam.gpus) * KvstoreParam.batch_image + momentum = 0.9 + wd = 0.0001 + clip_gradient = None + + class schedule: + begin_epoch = 0 + end_epoch = 6 + lr_iter = [60000 * 16 // 
(len(KvstoreParam.gpus) * KvstoreParam.batch_image), + 80000 * 16 // (len(KvstoreParam.gpus) * KvstoreParam.batch_image)] + + class warmup: + type = "gradual" + lr = 0.005 / 8 * len(KvstoreParam.gpus) * KvstoreParam.batch_image / 3 + iter = 500 + + + class TestParam: + min_det_score = 0 # filter appended boxes + max_det_per_image = 100 + + process_roidb = lambda x: x + process_output = lambda x, y: x + + class model: + prefix = "experiments/{}/checkpoint".format(General.name) + epoch = OptimizeParam.schedule.end_epoch + + class nms: + type = "nms" + thr = 0.5 + + class coco: + annotation = "data/coco/annotations/instances_minival2014.json" + + # data processing + class NormParam: + mean = (122.7717, 115.9465, 102.9801) # RGB order + std = (1.0, 1.0, 1.0) + + + class ResizeParam: + short = 800 + long = 1333 + + + class PadParam: + short = 800 + long = 1333 + max_num_gt = 100 + + + class AnchorTarget2DParam: + def __init__(self): + self.generate = self._generate() + self.mean = (0.0, 0.0, 0.0, 0.0) + self.std = (1.0, 1.0, 1.0, 1.0) + self.class_agnostic = False + + class _generate: + def __init__(self): + self.short = (100, 50, 25, 13, 7) + self.long = (167, 84, 42, 21, 11) + self.stride = (8, 16, 32, 64, 128) + + scales = (4 * 2 ** (1.0 / 3.0),) + aspects = (1.0,) + + class assign: + allowed_border = 9999 + pos_thr = 0.4 + neg_thr = 0.3 + min_pos_thr = 0.0 + + class sample: + image_anchor = None + pos_fraction = None + + + class RenameParam: + mapping = dict(image="data") + + + from core.detection_input import ReadRoiRecord, Resize2DImageBbox, \ + ConvertImageFromHwcToChw, Flip2DImageBbox, Pad2DImageBbox, \ + RenameRecord + from models.retinanet.input import PyramidAnchorTarget2D, Norm2DImage + + if is_train: + transform = [ + ReadRoiRecord(None), + Norm2DImage(NormParam), + Resize2DImageBbox(ResizeParam), + Flip2DImageBbox(), + Pad2DImageBbox(PadParam), + ConvertImageFromHwcToChw(), + PyramidAnchorTarget2D(AnchorTarget2DParam()), + RenameRecord(RenameParam.mapping) + ] + data_name = ["data", "im_info", "gt_bbox"] + label_name = ["rpn_cls_label", "rpn_reg_target", "rpn_reg_weight"] + else: + transform = [ + ReadRoiRecord(None), + Norm2DImage(NormParam), + Resize2DImageBbox(ResizeParam), + ConvertImageFromHwcToChw(), + RenameRecord(RenameParam.mapping) + ] + data_name = ["data", "im_info", "im_id", "rec_id"] + label_name = [] + + from models.retinanet import metric + + rpn_acc_metric = metric.FGAccMetric( + "FGAcc", + ["cls_loss_output"], + ["rpn_cls_label"] + ) + + bbox_acc_metric = metric.FGAccMetric( + "BboxFGAcc", + ["align_cls_loss_output", "align_label_blockgrad_output"], + [] + ) + + metric_list = [rpn_acc_metric, bbox_acc_metric] + + return General, KvstoreParam, RpnParam, RoiParam, BboxParam, DatasetParam, \ + ModelParam, OptimizeParam, TestParam, \ + transform, data_name, label_name, metric_list diff --git a/models/aligndet/__init__.py b/models/aligndet/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/models/aligndet/builder.py b/models/aligndet/builder.py new file mode 100644 index 0000000..62c3f72 --- /dev/null +++ b/models/aligndet/builder.py @@ -0,0 +1,939 @@ +from __future__ import division +from __future__ import print_function + +import mxnet as mx +import mxnext as X +import math + +from symbol.builder import BboxHead, RoiExtractor +from models.retinanet.builder import RetinaNetHead + + +class AlignRetinaNetHead(RetinaNetHead): + def __init__(self, pRpn): + super(AlignRetinaNetHead, self).__init__(pRpn) + self._proposal = None + + def 
get_all_proposal(self, conv_feat, im_info): + if self._proposal is not None: + return self._proposal + + p = self.p + batch_image = p.batch_image + stride = p.anchor_generate.stride + if not isinstance(stride, tuple): + stride = (stride, ) + ratios = p.anchor_generate.ratio + scales = p.anchor_generate.scale + anchor_target_mean = p.head.mean + anchor_target_std = p.head.std + num_base_anchor = len(ratios) * len(scales) + pick_anchor = p.pick_anchor or False + nms = p.nms or False + + cls_logit_dict, bbox_delta_dict = self.get_output(conv_feat) + + proposal_dict, proposal_score_dict = dict(), dict() + + for s in stride: + """ + cls_prob: (N, A * C, H, W) + bbox_delta: (N, A * 4, H, W) + """ + cls_prob = X.sigmoid(data=cls_logit_dict["stride%s" % s]) + bbox_delta = bbox_delta_dict["stride%s" % s] + + # (N, A * 4, H, W) -> (N, A, 4, H * W) + bbox_delta = X.reshape( + data=bbox_delta, + shape=(0, num_base_anchor, 4, -1), + name="bbox_delta_reshape_stride%s" % s + ) + # (N, A, 4, H * W) -> (N, H * W, A, 4) + bbox_delta = X.transpose( + data=bbox_delta, + axes=(0, 3, 1, 2), + name="bbox_delta_reshape_transpose_stride%s" % s + ) + # (N, H * W, A, 4) -> (N, H * W * A, 4) + bbox_delta = X.reshape( + data=bbox_delta, + shape=(0, -1, 4), + name="bbox_delta_reshape_transpose_reshape_stride%s" % s + ) + + anchor = mx.sym.contrib.GenAnchor( + cls_prob=bbox_delta_dict["stride%s" % s], + feature_stride=s, + scales=tuple(scales), + ratios=tuple(ratios), + name="anchors_stride%s" % s + ) + + # decode anchor + bbox_delta_list = mx.sym.split(bbox_delta, num_outputs=batch_image, axis=0, squeeze_axis=False) + im_info_list = mx.sym.split(im_info, num_outputs=batch_image, axis=0, squeeze_axis=False) + bbox_xyxy_list = list() + anchor_expand_dims = anchor.expand_dims(axis=0) + for bbox_delta_i, im_info_i in zip(bbox_delta_list, im_info_list): + pad_zero = mx.sym.zeros_like(bbox_delta_i) + bbox_delta_i = mx.sym.concat(pad_zero, bbox_delta_i, dim=-1) + bbox_xyxy_i = X.decode_bbox( + rois=anchor_expand_dims, + bbox_pred=bbox_delta_i, + im_info=im_info_i, + bbox_mean=anchor_target_mean, + bbox_std=anchor_target_std, + class_agnostic=True + ) + bbox_xyxy_list.append(bbox_xyxy_i) + bbox_xyxy = mx.sym.concat(*bbox_xyxy_list, dim=0, name="proposal_stride%s_retina" % s) + + proposal_dict["stride%s" % s] = bbox_xyxy + proposal_score_dict["stride%s" % s] = cls_prob + + if pick_anchor: + for s in stride: + cls_score = proposal_score_dict["stride%s"%s] + bbox_xyxy = proposal_dict["stride%s"%s] + + # (N, A * C, H, W) -> (N, H * W, A), C = 1 + cls_score = cls_score.transpose((0, 2, 3, 1)) + cls_score = cls_score.reshape((0, -3, 0)) + # (N, H * W * A, 4) -> (N, H * W, A, 4) + bbox_xyxy = bbox_xyxy.reshape((0, -1, num_base_anchor, 4)) + + argmax_cls_score = cls_score.argmax(axis=2) + argmax_cls_score_stack = mx.sym.stack(*([argmax_cls_score] * 4), axis=2) + + sample_cls_score = mx.sym.pick(cls_score, argmax_cls_score, axis=2) + sample_bbox_xyxy = mx.sym.pick(bbox_xyxy, argmax_cls_score_stack, axis=2) + + # (N, H * W) -> (N, A * C, H * W), A = C = 1 + sample_cls_score = sample_cls_score.reshape((0, 1, -1)) + + proposal_score_dict["stride%s"%s] = sample_cls_score + proposal_dict["stride%s"%s] = sample_bbox_xyxy + elif nms: + nms_thr = p.nms_thr + + for s in stride: + cls_score = proposal_score_dict["stride%s"%s] + bbox_xyxy = proposal_dict["stride%s"%s] + + # (N, A * C, H, W) -> (N, H * W, A, C) + cls_score = cls_score.reshape((0, 0, -1)) + cls_score = cls_score.transpose((0, 2, 1)) + cls_score = cls_score.reshape((0, 0, 
num_base_anchor, -1)) + # (N, H * W, A, C) -> (N, H * W, A, 1) + cls_score = mx.sym.sum(cls_score, axis=3, keepdims=True) + proposal_score_dict["stride%s"%s] = cls_score + # (N, H * W * A, 4) -> (N, H * W, A, 4) + bbox_xyxy = bbox_xyxy.reshape((0, -1, num_base_anchor, 4)) + + (cls_score, bbox_xyxy) = mx.sym.contrib.InplaceNMS( + cls_prob=cls_score, + bbox_pred=bbox_xyxy, + nms_thr=nms_thr, + score_reset_value=0, + bbox_reset_value=999999, # larger than allowed_border + name="inplace_nms" + ) + + # (N, H * W, A, C) -> (N, A * C, H, W), C = 1 + cls_score = cls_score.reshape((0, 0, -3)) + cls_score = cls_score.transpose((0, 2, 1)) + cls_score = cls_score.reshape_like(proposal_score_dict["stride%s"%s]) + # (N, H * W, A, 4) -> (N, H * W * A, 4) + bbox_xyxy = bbox_xyxy.reshape_like(proposal_dict["stride%s"%s]) + + proposal_dict["stride%s"%s] = bbox_xyxy + proposal_score_dict["stride%s"%s] = cls_score + + proposal = (proposal_dict, proposal_score_dict) + self._proposal = proposal + + return proposal + + def get_sampled_proposal(self, conv_feat, gt_bbox, im_info): + p = self.p + + mean = p.bbox_target.mean + std = p.bbox_target.std + class_agnostic = p.bbox_target.class_agnostic + short = p.anchor_generate.short + long = p.anchor_generate.long + stride = p.anchor_generate.stride + if not isinstance(stride, tuple): + stride = (stride, ) + num_anchors = len(p.anchor_generate.scale) * len(p.anchor_generate.ratio) + allowed_border = p.bbox_target.allowed_border + pos_thr = p.bbox_target.pos_thr + neg_thr = p.bbox_target.neg_thr + min_pos_thr = p.bbox_target.min_pos_thr + pick_anchor = p.pick_anchor or False + if pick_anchor: + num_anchors = 1 + + (anchor_dict, anchor_score_dict) = self.get_all_proposal(conv_feat, im_info) + + # custom op to encode new target + from models.aligndet import encode_anchor # noqa: F401 + + (label, bbox_target, bbox_weight) = mx.sym.Custom( + op_type="encode_anchor", + gt_boxes=gt_bbox, + im_info=im_info, + short=short, + long=long, + stride=stride, + num_anchors=num_anchors, + class_agnostic=class_agnostic, + allowed_border=allowed_border, + pos_thr=pos_thr, + neg_thr=neg_thr, + min_pos_thr=min_pos_thr, + mean=mean, + std=std, + name="encode_anchor", + **anchor_dict + ) + + return anchor_dict, label, bbox_target, bbox_weight + + def get_loss(self, conv_feat, cls_label, bbox_target, bbox_weight): + p = self.p + stride = p.anchor_generate.stride + if not isinstance(stride, tuple): + stride = (stride, ) + num_class = p.num_class + loss_weight = p.loss_weight + num_base_anchor = len(p.anchor_generate.ratio) * len(p.anchor_generate.scale) + scale_loss_shift = 128.0 if p.fp16 else 1.0 + reg_only = p.reg_only or False + + cls_logit_dict, bbox_delta_dict = self.get_output(conv_feat) + cls_logit_reshape_list = [] + bbox_delta_reshape_list = [] + + # reshape logit and delta + for i, s in enumerate(stride): + # (N, A * C, H, W) -> (N, A, C, H * W) + cls_logit = X.reshape( + data=cls_logit_dict["stride%s" % s], + shape=(0, num_base_anchor, num_class-1, -1), + name="cls_stride%s_reshape" % s + ) + # (N, A, C, H * W) -> (N, A, H * W, C) + cls_logit = X.transpose( + data=cls_logit, + axes=(0, 1, 3, 2), + name="cls_stride%s_transpose" % s + ) + # (N, A, H * W, C) -> (N, A * H * W, C) + cls_logit = X.reshape( + data=cls_logit, + shape=(0, -3, 0), + name="cls_stride%s_transpose_reshape" % s + ) + + # (N, A * 4, H, W) -> (N, A * 4, H * W) + bbox_delta = X.reshape( + data=bbox_delta_dict["stride%s" % s], + shape=(0, 0, -1), + name="bbox_stride%s_reshape" % s + ) + + cls_logit_reshape_list.append(cls_logit)
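+ # the matching per-level bbox deltas are collected as well; both lists are concatenated across FPN levels right after this loop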
+ bbox_delta_reshape_list.append(bbox_delta) + + cls_logit_concat = X.concat(cls_logit_reshape_list, axis=1, name="bbox_logit_concat") + bbox_delta_concat = X.concat(bbox_delta_reshape_list, axis=2, name="bbox_delta_concat") + + # classification loss + cls_loss = X.focal_loss( + data=cls_logit_concat, + label=cls_label, + normalization='valid', + alpha=p.focal_loss.alpha, + gamma=p.focal_loss.gamma, + grad_scale=0.0 if reg_only else 1.0 * loss_weight * scale_loss_shift, + workspace=1024, + name="cls_loss" + ) + + scalar = 0.11 + # regression loss + bbox_norm = X.bbox_norm( + data=bbox_delta_concat - bbox_target, + label=cls_label, + name="bbox_norm" + ) + bbox_loss = bbox_weight * X.smooth_l1( + data=bbox_norm, + scalar=math.sqrt(1/scalar), + name="bbox_loss" + ) + reg_loss = X.make_loss( + data=bbox_loss, + grad_scale=1.0 * loss_weight * scale_loss_shift, + name="reg_loss" + ) + + return cls_loss, reg_loss + + +class AlignHead(BboxHead): + def __init__(self, pBbox): + super(AlignHead, self).__init__(pBbox) + + p = self.p + num_conv = p.head.num_conv + init = p.head.init or X.gauss(0.01) + stage = p.stage or "" + ignore_p3 = p.head.ignore_p3 or False + + self.align_conv_weight = [X.var("align_conv_%d_weight%s" % (i + 1, stage), init=init) for i in range(num_conv)] + self.align_conv_bias = [X.var("align_conv_%d_bias%s" % (i + 1, stage), init=X.zero_init()) for i in range(num_conv)] + + if ignore_p3: + self.align_conv_p3_weight = [X.var("align_conv_p3_%d_weight%s" % (i + 1, stage), init=init) for i in range(num_conv)] + self.align_conv_p3_bias = [X.var("align_conv_p3_%d_bias%s" % (i + 1, stage), init=X.zero_init()) for i in range(num_conv)] + + self._head_feat_dict = None + self._cls_logit_dict = None + self._bbox_delta_dict = None + + self.stage = stage + + def _get_bbox_head_logit(self, conv_feat, conv_channel, ignore_p3=False): + p = self.p + num_conv = p.head.num_conv + use_1x1 = p.head.use_1x1 or False + + for i in range(num_conv): + conv_feat = X.conv( + data=conv_feat, + kernel=1 if use_1x1 else 3, + filter=conv_channel, + weight=self.align_conv_p3_weight[i] if ignore_p3 else self.align_conv_weight[i], + bias=self.align_conv_p3_bias[i] if ignore_p3 else self.align_conv_bias[i], + no_bias=False, + name="align_conv_%d%s" % (i + 1, self.stage) + ) + conv_feat = X.relu(conv_feat) + + if p.fp16: + conv_feat = X.to_fp32(conv_feat, name="align_conv_fp32") + + return conv_feat + + def get_output(self, conv_feat): + if self._cls_logit_dict is not None and self._bbox_delta_dict is not None: + return self._cls_logit_dict, self._bbox_delta_dict + + p = self.p + stride = p.anchor_generate.stride + if not isinstance(stride, tuple): + stride = (stride, ) + conv_channel = p.head.conv_channel + num_base_anchor = len(p.anchor_generate.ratio) * len(p.anchor_generate.scale) + num_class = p.num_class + separate_predictor = p.head.separate_predictor or False + batch_image = p.batch_image + ignore_p3 = p.head.ignore_p3 or False + + prior_prob = 0.01 + pi = -math.log((1 - prior_prob) / prior_prob) + + stage = self.stage + align_conv_cls_weight = X.var("align_conv_cls_weight%s" % stage, init=X.gauss(std=0.01)) + align_conv_cls_bias = X.var("align_conv_cls_bias%s" % stage, init=X.constant(pi)) + align_conv_bbox_weight = X.var("align_conv_bbox_weight%s" % stage, init=X.gauss(std=0.01)) + align_conv_bbox_bias = X.var("align_conv_bbox_bias%s" % stage, init=X.zero_init()) + + head_feat_dict = {} + cls_logit_dict = {} + bbox_delta_dict = {} + + for s in stride: + align_conv_relu = self._get_bbox_head_logit( 
conv_feat=conv_feat["stride%s" % s], + conv_channel=conv_channel, + ignore_p3=ignore_p3 and s == 8 + ) + + head_feat_dict["stride%s" % s] = align_conv_relu + + if separate_predictor: + align_conv_relu = X.reshape(align_conv_relu, shape=(batch_image, -1, 0, 0)) + + cls_logit = X.conv( + align_conv_relu, + num_group=num_base_anchor, + filter=num_base_anchor * (num_class - 1), + no_bias=False, + weight=align_conv_cls_weight, + bias=align_conv_cls_bias, + name="align_cls_score_stride%s" % s + ) + + bbox_delta = X.conv( + align_conv_relu, + num_group=num_base_anchor, + filter=num_base_anchor * 4, + no_bias=False, + weight=align_conv_bbox_weight, + bias=align_conv_bbox_bias, + name="align_bbox_pred_stride%s" % s + ) + + cls_logit_dict["stride%s" % s] = cls_logit + bbox_delta_dict["stride%s" % s] = bbox_delta + else: + cls_logit = X.conv( + align_conv_relu, + filter=num_class - 1, + no_bias=False, + weight=align_conv_cls_weight, + bias=align_conv_cls_bias, + name="align_cls_score_stride%s" % s + ) + + bbox_delta = X.conv( + align_conv_relu, + filter=4, + no_bias=False, + weight=align_conv_bbox_weight, + bias=align_conv_bbox_bias, + name="align_bbox_pred_stride%s" % s + ) + + cls_logit_dict["stride%s" % s] = cls_logit.reshape(shape=(-1, num_base_anchor * (num_class - 1), 0, 0)) + bbox_delta_dict["stride%s" % s] = bbox_delta.reshape(shape=(-1, num_base_anchor * 4, 0, 0)) + + self._head_feat_dict = head_feat_dict + self._cls_logit_dict = cls_logit_dict + self._bbox_delta_dict = bbox_delta_dict + + return cls_logit_dict, bbox_delta_dict + + def get_prediction(self, conv_feat, im_info, proposal): + p = self.p + merge_score = p.head.merge_score or False + use_prev_bbox = p.head.use_prev_bbox or False + use_prev_score = p.head.use_prev_score or False + assert not (merge_score and use_prev_score), "merge_score confilicts with use_prev_score" + stride = p.anchor_generate.stride + if not isinstance(stride, tuple): + stride = (stride, ) + ratios = p.anchor_generate.ratio + scales = p.anchor_generate.scale + pre_nms_top_n = p.proposal.pre_nms_top_n + min_bbox_side = p.proposal.min_bbox_side + min_det_score = p.proposal.min_det_score + anchor_target_mean = p.head.mean + anchor_target_std = p.head.std + num_anchors = len(ratios) * len(scales) + + cls_logit_dict, bbox_delta_dict = self.get_output(conv_feat) + + bbox_xyxy_list = [] + cls_score_list = [] + + proposal, proposal_score = proposal + + for i, s in enumerate(stride): + cls_prob = X.sigmoid(data=cls_logit_dict["stride%s" % s]) + bbox_delta = bbox_delta_dict["stride%s" % s] + anchors = proposal["stride%s" % s] + anchor_scores = proposal_score["stride%s" % s] + + if merge_score: + cls_prob = cls_prob + anchor_scores + + if use_prev_bbox: + bbox_delta = mx.sym.zeros_like(bbox_delta) + + if use_prev_score: + cls_prob = anchor_scores + + thresh_level = 0 if s == max(stride) else min_det_score + bbox_xyxy, cls_score = mx.sym.contrib.GenProposalRetina( + cls_prob=cls_prob, + bbox_pred=bbox_delta, + im_info=im_info, + anchors=anchors, + feature_stride=s, + anchor_mean=anchor_target_mean, + anchor_std=anchor_target_std, + num_anchors=num_anchors, + rpn_pre_nms_top_n=pre_nms_top_n, + rpn_min_size=min_bbox_side, + thresh=thresh_level, + batch_wise_anchor=True, + workspace=512, + name="proposal_pre_nms_stride%s" % s + ) + + bbox_xyxy_list.append(bbox_xyxy) + cls_score_list.append(cls_score) + + bbox_xyxy = X.concat(bbox_xyxy_list, axis=1, name="align_bbox_xyxy_concat") + cls_score = X.concat(cls_score_list, axis=1, name="align_cls_score_concat") + + return 
cls_score, bbox_xyxy + + def get_all_proposal(self, conv_feat, im_info, anchor_dict): + """ + anchors are the bboxes and scores of the previous stage, not of this stage. + """ + + p = self.p + batch_image = p.batch_image + stride = p.anchor_generate.stride + if not isinstance(stride, tuple): + stride = (stride, ) + anchor_target_mean = p.head.mean + anchor_target_std = p.head.std + ratios = p.anchor_generate.ratio + scales = p.anchor_generate.scale + num_base_anchor = len(ratios) * len(scales) + + cls_logit_dict, bbox_delta_dict = self.get_output(conv_feat) + + proposal_dict, proposal_score_dict = dict(), dict() + + for i, s in enumerate(stride): + cls_prob = X.sigmoid(data=cls_logit_dict["stride%s" % s]) + bbox_delta = bbox_delta_dict["stride%s" % s] + anchor = anchor_dict["stride%s" % s] + + # (N, A * 4, H, W) -> (N, A, 4, H * W) + bbox_delta = X.reshape( + data=bbox_delta, + shape=(0, num_base_anchor, 4, -1), + name="bbox_delta_reshape_stride%s" % s + ) + # (N, A, 4, H * W) -> (N, H * W, A, 4) + bbox_delta = X.transpose( + data=bbox_delta, + axes=(0, 3, 1, 2), + name="bbox_delta_reshape_transpose_stride%s" % s + ) + # (N, H * W, A, 4) -> (N, H * W * A, 4) + bbox_delta = X.reshape( + data=bbox_delta, + shape=(0, -1, 4), + name="bbox_delta_reshape_transpose_reshape_stride%s" % s + ) + + # decode anchor + bbox_delta_list = mx.sym.split(bbox_delta, num_outputs=batch_image, axis=0, squeeze_axis=False) + anchor_list = mx.sym.split(anchor, num_outputs=batch_image, axis=0, squeeze_axis=False) + im_info_list = mx.sym.split(im_info, num_outputs=batch_image, axis=0, squeeze_axis=False) + bbox_xyxy_list = list() + for bbox_delta_i, anchor_i, im_info_i in zip(bbox_delta_list, anchor_list, im_info_list): + pad_zero = mx.sym.zeros_like(bbox_delta_i) + bbox_delta_i = mx.sym.concat(pad_zero, bbox_delta_i, dim=-1) + bbox_xyxy_i = X.decode_bbox( + rois=anchor_i, + bbox_pred=bbox_delta_i, + im_info=im_info_i, + bbox_mean=anchor_target_mean, + bbox_std=anchor_target_std, + class_agnostic=True + ) + bbox_xyxy_list.append(bbox_xyxy_i) + bbox_xyxy = mx.sym.concat(*bbox_xyxy_list, dim=0, name="proposal_stride%s_%s" % (s, self.stage)) + + proposal_dict["stride%s" % s] = bbox_xyxy + proposal_score_dict["stride%s" % s] = cls_prob + + return proposal_dict, proposal_score_dict + + def get_proposal_and_label(self, conv_feat, gt_bbox, im_info, prev_anchor): + """ + Get proposal of this head with the proposal from the previous head. + Use proposal of this head to encode the training target of the next head. 
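+ Returns the refined anchors of this head together with the (label, bbox_target, bbox_weight) encoded for training the next head.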
+ """ + p = self.p + + mean = p.bbox_target.mean + std = p.bbox_target.std + class_agnostic = p.bbox_target.class_agnostic + short = p.anchor_generate.short + long = p.anchor_generate.long + stride = p.anchor_generate.stride + if not isinstance(stride, tuple): + stride = (stride, ) + num_anchors = len(p.anchor_generate.scale) * len(p.anchor_generate.ratio) + allowed_border = p.bbox_target.allowed_border + pos_thr = p.bbox_target.pos_thr + neg_thr = p.bbox_target.neg_thr + min_pos_thr = p.bbox_target.min_pos_thr + + anchor_dict, anchor_score_dict = self.get_all_proposal(conv_feat, im_info, prev_anchor) + + # custom op to encode new target + from models.aligndet import encode_anchor # noqa: F401 + + (label, bbox_target, bbox_weight) = mx.sym.Custom( + op_type="encode_anchor", + gt_boxes=gt_bbox, + im_info=im_info, + short=short, + long=long, + stride=stride, + num_anchors=num_anchors, + class_agnostic=class_agnostic, + allowed_border=allowed_border, + pos_thr=pos_thr, + neg_thr=neg_thr, + min_pos_thr=min_pos_thr, + mean=mean, + std=std, + name="encode_anchor", + **anchor_dict + ) + + return anchor_dict, label, bbox_target, bbox_weight + + def get_loss(self, conv_feat, cls_label, bbox_target, bbox_weight): + p = self.p + stride = p.anchor_generate.stride + if not isinstance(stride, tuple): + stride = (stride, ) + num_class = p.num_class + loss_weight = p.loss_weight + num_base_anchor = len(p.anchor_generate.ratio) * len(p.anchor_generate.scale) + stage = self.stage + + cls_logit_dict, bbox_delta_dict = self.get_output(conv_feat) + cls_logit_reshape_list = [] + bbox_delta_reshape_list = [] + + scale_loss_shift = 128.0 if p.fp16 else 1.0 + + # reshape logit and delta + for i, s in enumerate(stride): + # (N, A * C, H, W) -> (N, A, C, H * W) + cls_logit = X.reshape( + data=cls_logit_dict["stride%s" % s], + shape=(0, num_base_anchor, num_class - 1, -1), + name="align_cls_stride%s_reshape" % s + ) + # (N, A, C, H * W) -> (N, A, H * W, C) + cls_logit = X.transpose( + data=cls_logit, + axes=(0, 1, 3, 2), + name="align_cls_stride%s_transpose" % s + ) + # (N, A, H * W, C) -> (N, A * H * W, C) + cls_logit = X.reshape( + data=cls_logit, + shape=(0, -3, 0), + name="align_cls_stride%s_transpose_reshape" % s + ) + + # (N, A * 4, H, W) -> (N, A * 4, H * W) + bbox_delta = X.reshape( + data=bbox_delta_dict["stride%s" % s], + shape=(0, 0, -1), + name="align_bbox_stride%s_reshape" % s + ) + + cls_logit_reshape_list.append(cls_logit) + bbox_delta_reshape_list.append(bbox_delta) + + cls_logit_concat = X.concat( + cls_logit_reshape_list, + axis=1, + name="align_bbox_logit_concat" + ) + bbox_delta_concat = X.concat( + bbox_delta_reshape_list, + axis=2, + name="align_bbox_delta_concat" + ) + + # classification loss + cls_loss = X.focal_loss( + data=cls_logit_concat, + label=cls_label, + normalization='valid', + alpha=p.focal_loss.alpha, + gamma=p.focal_loss.gamma, + grad_scale=1.0 * loss_weight * scale_loss_shift, + workspace=1024, + name="align_cls_loss%s" % stage + ) + + scalar = 0.11 + # regression loss + bbox_norm = X.bbox_norm( + data=bbox_delta_concat - bbox_target, + label=cls_label, + name="align_bbox_norm" + ) + bbox_loss = bbox_weight * X.smooth_l1( + data=bbox_norm, + scalar=math.sqrt(1 / scalar), + name="align_bbox_loss" + ) + reg_loss = X.make_loss( + data=bbox_loss, + grad_scale=1.0 * loss_weight * scale_loss_shift, + name="align_reg_loss%s" % stage + ) + + cls_label = X.block_grad(cls_label, name="align_label_blockgrad%s" % stage) + + return cls_loss, reg_loss, cls_label + + +class 
AlignRoiExtractor(RoiExtractor): + def __init__(self, pRoi): + super(AlignRoiExtractor, self).__init__(pRoi) + + def get_roi_feature(self, feat_dict, anchor_dict): + p = self.p + stride = p.stride + if not isinstance(stride, tuple): + stride = (stride, ) + conv_channel = p.conv_channel + ratios = p.ratio + scales = p.scale + num_anchors = len(ratios) * len(scales) + sample_bins = p.sample_bins + stage = p.stage or "" + im2col = p.im2col or False + conv3d = p.conv3d or False + roialign = p.roialign or False + ignore_p3 = p.ignore_p3 or False + gauss_init = p.gauss_init or False + guided_anchor = p.guided_anchor or False + learned_offset = p.learned_offset or False + assert not (guided_anchor and learned_offset) + + if p.fp16: + for s in stride: + feat_dict["stride%s" % s] = X.to_fp32( + feat_dict["stride%s" % s], + name="feat_stride%s_to_fp32%s" % (s, stage) + ) + + anchor_feat_dict = {} + + for s in stride: + if ignore_p3 and s == 8: + old_sample_bins = sample_bins + old_conv_channel = conv_channel + sample_bins = 3 + conv_channel = old_conv_channel // (old_sample_bins ** 2) * (sample_bins ** 2) + + if guided_anchor: + # (N, H * W * A, 4) -> (N, A * K * K * 2, H, W) + (x1y1, x2y2) = mx.sym.split(anchor_dict["stride%s" % s], num_outputs=2, axis=-1) + hw = x2y2 - x1y1 # (N, H * W * A, 2) + hw = X.reshape(hw, [0, -1, 2 * num_anchors]) # (N, H * W, A * 2) + hw = X.transpose(hw, [0, 2, 1]) # (N, A * 2, H * W) + hw = mx.sym.reshape_like(hw, feat_dict["stride%s" % s], lhs_begin=2, lhs_end=None, rhs_begin=2, rhs_end=None) # (N, A * 2, H, W) + # normalize input hw + hw = mx.sym.BatchNorm(hw, fix_gamma=False, use_global_stats=False, name="guided_anchor_bn_stride%s" % s) + offset = X.conv(hw, name="guided_anchor_offset_stride%s" % s, filter=2 * num_anchors * sample_bins ** 2, init=X.gauss(0.01)) + elif learned_offset: + offset = X.conv(feat_dict["stride%s" % s], name="learned_offset_stride%s" % s, filter=2 * num_anchors * sample_bins ** 2, init=X.gauss(0.01)) + else: + # (N, H * W * A, 4) -> (N, A * K * K * 2, H, W) + offset = mx.sym.contrib.GetAnchorOffset( + data=feat_dict["stride%s" % s], + anchor=anchor_dict["stride%s" % s], + kernel=(sample_bins, sample_bins), + stride=s, + name="get_anchor_offset_stride%s%s" % (s, stage) + ) + + anchor_feat_list = [] + for anchor_idx in range(num_anchors): + offset_i = mx.sym.slice_axis( + offset, + begin=anchor_idx * (sample_bins * sample_bins * 2), + end=(anchor_idx + 1) * (sample_bins * sample_bins * 2), + axis=1, + name="anchor_offset_stride%s_slice%s%s" % (s, anchor_idx, stage) + ) + if im2col: + anchor_feat_i = mx.sym.contrib.DeformableConvolutionIm2Col( + data=feat_dict["stride%s" % s], + offset=offset_i, + kernel=(sample_bins, sample_bins), + pad=(sample_bins // 2, sample_bins // 2), + num_filter=conv_channel, + name="deform_im2col_stride%s_slice%s%s" % (s, anchor_idx, stage) + ) + + else: + if gauss_init: + w = mx.sym.var("deform_conv_stride%s_slice%s%s_weight" % (s, anchor_idx, stage), init=X.gauss(0.01)) + else: + w = None + anchor_feat_i = mx.sym.contrib.DeformableConvolution( + data=feat_dict["stride%s" % s], + offset=offset_i, + weight=w, + kernel=(sample_bins, sample_bins), + pad=(sample_bins // 2, sample_bins // 2), + num_filter=conv_channel, + num_deformable_group=1, + no_bias=False, + name="deform_conv_stride%s_slice%s%s" % (s, anchor_idx, stage) + ) + anchor_feat_list.append(anchor_feat_i) + anchor_feat = X.concat( + anchor_feat_list, + axis=1, + name="anchor_feat_concat_stride%s%s" % (s, stage) + ) + + anchor_feat = X.reshape(anchor_feat, 
shape=(-1, conv_channel, 0, 0)) + + if roialign: + # (N, H * W * A, 4) -> (N, H * W * A, C, bin, bin) + feat = feat_dict["stride%s" % s] + anchor = anchor_dict["stride%s" % s] + anchor_feat = X.roi_align(feat, anchor, out_size=sample_bins, stride=s, name="roialign_stride%s" % s) + anchor_feat = X.reshape(anchor_feat, [0, 0, -1]) + anchor_feat = X.transpose(anchor_feat, [0, 2, 1]) + anchor_feat = mx.sym.reshape_like(anchor_feat, feat, lhs_begin=2, lhs_end=None, rhs_begin=2, rhs_end=None) + + anchor_feat_dict["stride%s" % s] = anchor_feat + + if ignore_p3 and s == 8: + sample_bins = old_sample_bins + conv_channel = old_conv_channel + + if p.fp16: + for s in stride: + anchor_feat_dict["stride%s" % s] = X.to_fp16( + anchor_feat_dict["stride%s" % s], + name="anchor_feat_stride%s_to_fp16%s" % (s, stage) + ) + + return anchor_feat_dict + + def get_roi_feature_test(self, feat_dict, anchor_dict): + return self.get_roi_feature(feat_dict, anchor_dict) + + +class FlatRoiExtractor(RoiExtractor): + def __init__(self, pRoi): + super(FlatRoiExtractor, self).__init__(pRoi) + + def get_roi_feature(self, feat_dict, anchor_dict): + return feat_dict + + def get_roi_feature_test(self, feat_dict, anchor_dict): + return self.get_roi_feature(feat_dict, anchor_dict) + + +class CascadeRcnn(object): + def __init__(self): + pass + + @staticmethod + def get_train_symbol(backbone, neck, rpn_head, roi_extractor: AlignRoiExtractor, roi_extractor_2nd: AlignRoiExtractor, bbox_head: AlignHead, bbox_head_2nd: AlignHead, share_feat=False): + gt_bbox = X.var("gt_bbox") + im_info = X.var("im_info") + rpn_cls_label = X.var("rpn_cls_label") + rpn_reg_target = X.var("rpn_reg_target") + rpn_reg_weight = X.var("rpn_reg_weight") + + rpn_feat = backbone.get_rpn_feature() + rcnn_feat = backbone.get_rcnn_feature() + rpn_feat = neck.get_rpn_feature(rpn_feat) + rcnn_feat = neck.get_rcnn_feature(rcnn_feat) + + rpn_loss = rpn_head.get_loss(rpn_feat, rpn_cls_label, rpn_reg_target, rpn_reg_weight) + + # stage1 + proposal, bbox_cls, bbox_target, bbox_weight = \ + rpn_head.get_sampled_proposal( + rpn_feat, + gt_bbox, + im_info + ) + roi_feat = roi_extractor.get_roi_feature(rcnn_feat, proposal) + bbox_loss = bbox_head.get_loss( + roi_feat, + bbox_cls, + bbox_target, + bbox_weight + ) + + # stage2 + # though call get_sampled_proposal, bbox_head does not sample rois + proposal_2nd, bbox_cls_2nd, bbox_target_2nd, bbox_weight_2nd = \ + bbox_head.get_proposal_and_label( + roi_feat, + gt_bbox, + im_info, + proposal, + ) + if share_feat: + feat = rcnn_feat + else: + feat = bbox_head._head_feat_dict + roi_feat_2nd = roi_extractor_2nd.get_roi_feature(feat, proposal_2nd) + bbox_loss_2nd = bbox_head_2nd.get_loss( + roi_feat_2nd, + bbox_cls_2nd, + bbox_target_2nd, + bbox_weight_2nd + ) + + return X.group(rpn_loss + bbox_loss + bbox_loss_2nd) + + @staticmethod + def get_test_symbol(backbone, neck, rpn_head: AlignRetinaNetHead, roi_extractor: AlignRoiExtractor, roi_extractor_2nd: AlignRoiExtractor, bbox_head: AlignHead, bbox_head_2nd: AlignHead, stage=3, share_feat=False): + im_info = X.var("im_info") + im_id = X.var("im_id") + rec_id = X.var("rec_id") + + rpn_feat = backbone.get_rpn_feature() + rcnn_feat = backbone.get_rcnn_feature() + rpn_feat = neck.get_rpn_feature(rpn_feat) + rcnn_feat = neck.get_rcnn_feature(rcnn_feat) + + if stage == 1: + cls_score, bbox_xyxy = rpn_head.get_prediction(rpn_feat, im_info) + elif stage == 2: + proposal, proposal_score = rpn_head.get_all_proposal(rpn_feat, im_info) + roi_feat = roi_extractor.get_roi_feature(rcnn_feat, 
proposal) + cls_score, bbox_xyxy = bbox_head.get_prediction( + roi_feat, + im_info, + (proposal, proposal_score) + ) + elif stage == 3: + proposal, proposal_score = rpn_head.get_all_proposal(rpn_feat, im_info) + roi_feat = roi_extractor.get_roi_feature(rcnn_feat, proposal) + proposal_2nd, proposal_score_2nd = bbox_head.get_all_proposal( + roi_feat, + im_info, + proposal + ) + if share_feat: + feat = rcnn_feat + else: + feat = bbox_head._head_feat_dict + roi_feat_2nd = roi_extractor_2nd.get_roi_feature(feat, proposal_2nd) + cls_score, bbox_xyxy = bbox_head_2nd.get_prediction( + roi_feat_2nd, + im_info, + (proposal_2nd, proposal_score_2nd) + ) + else: + raise ValueError("No more stages") + + return X.group([rec_id, im_id, im_info, cls_score, bbox_xyxy]) diff --git a/models/aligndet/encode_anchor.py b/models/aligndet/encode_anchor.py new file mode 100644 index 0000000..07dabe8 --- /dev/null +++ b/models/aligndet/encode_anchor.py @@ -0,0 +1,185 @@ +""" +Encode boxes for anchors w.r.t matching gt_boxes +author: Chenxia Han + +input: + anchor: (N, H * W * A, 4) + gt_boxes: (N, MAX_NUM_GT, 5) + im_info: (N, 3) +output: + label: (N, \sum{A * H * W}) + target: (N, A * 4, \sum{H * W}) + weight: (N, A * 4, \sum{H * W}) +""" + +import mxnet as mx +import numpy as np + +from models.aligndet.input import AlignPyramidAnchorTarget2D + +class AnchorTarget2DParam: + def __init__(self, short, long, stride, mean, std, class_agnostic, + all_anchor_list, allowed_border, pos_thr, neg_thr, min_pos_thr): + self.mean = mean + self.std = std + self.class_agnostic = class_agnostic + + # input anchor + self.all_anchor_list = all_anchor_list + + self.generate.short = tuple(short) + self.generate.long = tuple(long) + self.generate.stride = tuple(stride) + self.assign.allowed_border = allowed_border + self.assign.pos_thr = pos_thr + self.assign.neg_thr = neg_thr + self.assign.min_pos_thr = min_pos_thr + + class generate: + short = None + long = None + stride = None + + class assign: + allowed_border = None + pos_thr = None + neg_thr = None + min_pos_thr = None + + +class EncodeAnchorOperator(mx.operator.CustomOp): + def __init__(self, short, long, stride, mean, std, class_agnostic, + allowed_border, pos_thr, neg_thr, min_pos_thr): + self.short = short + self.long = long + self.stride = stride + self.mean = mean + self.std = std + self.class_agnostic = class_agnostic + self.allowed_border = allowed_border + self.pos_thr = pos_thr + self.neg_thr = neg_thr + self.min_pos_thr = min_pos_thr + + def forward(self, is_train, req, in_data, out_data, aux): + anchor_list = in_data[:-2] + gt_boxes = in_data[-2] + im_info = in_data[-1] + + nbatch = in_data[0].shape[0] + + short = self.short + long = self.long + stride = self.stride + mean = self.mean + std = self.std + class_agnostic = self.class_agnostic + allowed_border = self.allowed_border + pos_thr = self.pos_thr + neg_thr = self.neg_thr + min_pos_thr = self.min_pos_thr + + label_list = [] + target_list = [] + weight_list = [] + + for i in range(nbatch): + anchor_list_np = [] + for anchor in anchor_list: + anchor_list_np.append(anchor[i].asnumpy()) + + anchor_param = AnchorTarget2DParam( + short, + long, + stride, + mean, + std, + class_agnostic, + anchor_list_np, + allowed_border, + pos_thr, + neg_thr, + min_pos_thr + ) + anchor_target_2d = AlignPyramidAnchorTarget2D(anchor_param) + input_record = {"im_info": im_info[i].asnumpy(), "gt_bbox": gt_boxes[i].asnumpy()} + anchor_target_2d.apply(input_record) + + label_list.append(mx.nd.array(input_record["rpn_cls_label"])) + 
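+ # per-image label/target/weight are stacked into (N, ...) batch tensors once all images are processed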
target_list.append(mx.nd.array(input_record["rpn_reg_target"])) + weight_list.append(mx.nd.array(input_record["rpn_reg_weight"])) + + label = mx.nd.stack(*label_list, axis=0) + target = mx.nd.stack(*target_list, axis=0) + weight = mx.nd.stack(*weight_list, axis=0) + + self.assign(out_data[0], req[0], label) + self.assign(out_data[1], req[1], target) + self.assign(out_data[2], req[2], weight) + + def backward(self, req, out_grad, in_data, out_data, in_grad, aux): + num_input = len(in_data) + for i in range(num_input): + self.assign(in_grad[i], req[i], 0) + + +@mx.operator.register("encode_anchor") +class EncodeAnchorProp(mx.operator.CustomOpProp): + def __init__(self, short, long, stride, num_anchors, class_agnostic, + allowed_border, pos_thr, neg_thr, min_pos_thr, + mean="(0, 0, 0, 0)", std="(1, 1, 1, 1)"): + super(EncodeAnchorProp, self).__init__(need_top_grad=False) + self.short = eval(short) + self.long = eval(long) + self.stride = eval(stride) + self.num_anchors = eval(num_anchors) + self.class_agnostic = eval(class_agnostic) + self.allowed_border = int(allowed_border) + self.pos_thr = float(pos_thr) + self.neg_thr = float(neg_thr) + self.min_pos_thr = float(min_pos_thr) + self.mean = eval(mean) + self.std = eval(std) + + def list_arguments(self): + args_list = [] + for s in self.stride: + args_list.append("stride%s" % s) + args_list += ["gt_boxes", "im_info"] + + return args_list + + def list_outputs(self): + return ["label", "target", "weight"] + + def infer_shape(self, in_shape): + anchor_shape_list = in_shape[:-2] + gt_boxes_shape = in_shape[-2] + im_info_shape = in_shape[-1] + + nbatch = im_info_shape[0] + + assert(anchor_shape_list[0][2] == 4) + assert(gt_boxes_shape[2] == 5) + assert(im_info_shape[1] == 3) + assert(anchor_shape_list[0][0] == nbatch) + assert(gt_boxes_shape[0] == nbatch) + + num_anchors = self.num_anchors + total_anchors = np.sum([x[1] for x in anchor_shape_list]) + + label_shape = (nbatch, total_anchors) + target_shape = (nbatch, num_anchors * 4, total_anchors // num_anchors) + weight_shape = (nbatch, num_anchors * 4, total_anchors // num_anchors) + + return anchor_shape_list + [gt_boxes_shape, im_info_shape], \ + [label_shape, target_shape, weight_shape] + + def create_operator(self, ctx, shapes, dtypes): + return EncodeAnchorOperator(self.short, self.long, self.stride, + self.mean, self.std, self.class_agnostic, + self.allowed_border, self.pos_thr, + self.neg_thr, self.min_pos_thr) + + def declare_backward_dependency(self, out_grad, in_data, out_data): + return [] diff --git a/models/aligndet/input.py b/models/aligndet/input.py new file mode 100644 index 0000000..e53a46d --- /dev/null +++ b/models/aligndet/input.py @@ -0,0 +1,92 @@ +from __future__ import division +from __future__ import print_function + +import numpy as np + +from models.retinanet.input import PyramidAnchorTarget2DBase + +class AlignPyramidAnchorTarget2D(PyramidAnchorTarget2DBase): + """ + input: image_meta: tuple(h, w, scale) + gt_bbox, ndarry(max_num_gt, 4) + output: anchor_label, ndarray(num_anchor * h * w) + anchor_bbox_target, ndarray(num_anchor * 4, h * w) + anchor_bbox_weight, ndarray(num_anchor * 4, h * w) + """ + + def __init__(self, pAnchor): + super(AlignPyramidAnchorTarget2D, self).__init__(pAnchor) + + self.pyramid_levels = len(self.p.generate.stride) + + self.anchor_target_2d = PyramidAnchorTarget2DBase(self.p) + self.all_anchor_list = self.p.all_anchor_list + + self.anchor_target_2d.v_all_anchor = self.v_all_anchor + self.anchor_target_2d.h_all_anchor = self.h_all_anchor + + 
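# the v_all_anchor/h_all_anchor overrides below make the inner target maker label the externally supplied (decoded) anchors instead of regenerating fixed ones +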
@property + def v_all_anchor(self): + anchors = np.concatenate(self.all_anchor_list) + return anchors + + @property + def h_all_anchor(self): + anchors = np.concatenate(self.all_anchor_list) + return anchors + + def apply(self, input_record): + anchor_size = [0] + [x.shape[0] for x in self.all_anchor_list] + anchor_size = np.cumsum(anchor_size) + cls_label, reg_target, reg_weight = \ + self.anchor_target_2d.apply(input_record) + + im_info = input_record["im_info"] + h, w = im_info[:2] + + mean = np.array(self.p.mean) + std = np.array(self.p.std) + + cls_label_list = [] + reg_target_list = [] + reg_weight_list = [] + for i in range(self.pyramid_levels): + p = self.p + + cls_label_level = cls_label[anchor_size[i]:anchor_size[i+1]] + reg_target_level = reg_target[anchor_size[i]:anchor_size[i+1]] + reg_weight_level = reg_weight[anchor_size[i]:anchor_size[i+1]] + """ + label: (h * w * A) -> (A * h * w) + bbox_target: (h * w * A, 4) -> (A * 4, h * w) + bbox_weight: (h * w * A, 4) -> (A * 4, h * w) + """ + if h >= w: + fh, fw = p.generate.long[i], p.generate.short[i] + else: + fh, fw = p.generate.short[i], p.generate.long[i] + + reg_target_level = (reg_target_level - mean) / std + + cls_label_level = cls_label_level.reshape((fh, fw, -1)).transpose(2, 0, 1).reshape(-1) + reg_target_level = reg_target_level.reshape((fh, fw, -1)).transpose(2, 0, 1) + reg_weight_level = reg_weight_level.reshape((fh, fw, -1)).transpose(2, 0, 1) + + reg_target_level = reg_target_level.reshape(-1, fh * fw) + reg_weight_level = reg_weight_level.reshape(-1, fh * fw) + + cls_label_list.append(cls_label_level) + reg_target_list.append(reg_target_level) + reg_weight_list.append(reg_weight_level) + + cls_label = np.concatenate(cls_label_list, axis=0) + reg_target = np.concatenate(reg_target_list, axis=1) + reg_weight = np.concatenate(reg_weight_list, axis=1) + + input_record["rpn_cls_label"] = cls_label + input_record["rpn_reg_target"] = reg_target + input_record["rpn_reg_weight"] = reg_weight + + return input_record["rpn_cls_label"], \ + input_record["rpn_reg_target"], \ + input_record["rpn_reg_weight"] diff --git a/operator_cxx/contrib/deformable_convolution_im2col-inl.h b/operator_cxx/contrib/deformable_convolution_im2col-inl.h new file mode 100644 index 0000000..49cbe0a --- /dev/null +++ b/operator_cxx/contrib/deformable_convolution_im2col-inl.h @@ -0,0 +1,477 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
diff --git a/operator_cxx/contrib/deformable_convolution_im2col-inl.h b/operator_cxx/contrib/deformable_convolution_im2col-inl.h
new file mode 100644
index 0000000..49cbe0a
--- /dev/null
+++ b/operator_cxx/contrib/deformable_convolution_im2col-inl.h
@@ -0,0 +1,477 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * Copyright (c) 2017 Microsoft
+ * Licensed under The Apache-2.0 License [see LICENSE for details]
+ * \file deformable_convolution_im2col-inl.h
+ * \brief
+ * \ref: https://github.com/Yangqing/caffe/wiki/Convolution-in-Caffe:-a-memo
+ * \ref: https://arxiv.org/abs/1703.06211
+ * \author Yuwen Xiong, Haozhi Qi, Jifeng Dai, Yuntao Chen
+*/
+#ifndef MXNET_OPERATOR_CONTRIB_DEFORMABLE_CONVOLUTION_IM2COL_INL_H_
+#define MXNET_OPERATOR_CONTRIB_DEFORMABLE_CONVOLUTION_IM2COL_INL_H_
+
+#include <mxnet/io.h>
+#include <mxnet/base.h>
+#include <mxnet/ndarray.h>
+#include <mxnet/operator.h>
+#include <dmlc/logging.h>
+#include <dmlc/optional.h>
+#include <algorithm>
+#include <map>
+#include <vector>
+#include <string>
+#include <utility>
+#include "../operator_common.h"
+#include "../nn/im2col.h"
+#include "./nn/deformable_im2col.h"
+#include "../linalg.h"
+
+
+namespace mxnet {
+namespace op {
+
+namespace conv {
+  enum DeformableConvolutionIm2ColOpInputs { kData, kOffset };
+  enum DeformableConvolutionIm2ColOpOutputs { kOut };
+  enum DeformableConvolutionIm2ColOpResource { kTempSpace };
+}
+
+struct DeformableConvolutionIm2ColParam : public dmlc::Parameter<DeformableConvolutionIm2ColParam> {
+  mxnet::TShape kernel;
+  mxnet::TShape stride;
+  mxnet::TShape dilate;
+  mxnet::TShape pad;
+  uint32_t num_filter;
+  uint32_t num_group;
+  uint32_t num_deformable_group;
+  uint64_t workspace;
+  bool no_bias;
+  dmlc::optional<int> layout;
+  DMLC_DECLARE_PARAMETER(DeformableConvolutionIm2ColParam) {
+    DMLC_DECLARE_FIELD(kernel).describe("Convolution kernel size: (h, w) or (d, h, w)");
+    DMLC_DECLARE_FIELD(stride).set_default(mxnet::TShape(0, -1))
+      .describe("Convolution stride: (h, w) or (d, h, w). Defaults to 1 for each dimension.");
+    DMLC_DECLARE_FIELD(dilate).set_default(mxnet::TShape(0, -1))
+      .describe("Convolution dilate: (h, w) or (d, h, w). Defaults to 1 for each dimension.");
+    DMLC_DECLARE_FIELD(pad).set_default(mxnet::TShape(0, -1))
+      .describe("Zero pad for convolution: (h, w) or (d, h, w). Defaults to no padding.");
+    DMLC_DECLARE_FIELD(num_filter).set_range(1, 100000)
+      .describe("Convolution filter(channel) number");
+    DMLC_DECLARE_FIELD(num_group).set_default(1)
+      .describe("Number of group partitions.");
+    DMLC_DECLARE_FIELD(num_deformable_group).set_default(1)
+      .describe("Number of deformable group partitions.");
+    DMLC_DECLARE_FIELD(workspace).set_default(1024).set_range(0, 8192)
+      .describe("Maximum temporary workspace allowed for convolution (MB).");
+    DMLC_DECLARE_FIELD(no_bias).set_default(false)
+      .describe("Whether to disable bias parameter.");
+    DMLC_DECLARE_FIELD(layout)
+      .add_enum("NCW", mshadow::kNCW)
+      .add_enum("NCHW", mshadow::kNCHW)
+      .add_enum("NCDHW", mshadow::kNCDHW)
+      .set_default(dmlc::optional<int>())
+      .describe("Set layout for input, output and weight. Empty for\n    "
+                "default layout: NCW for 1d, NCHW for 2d and NCDHW for 3d.");
+  }
+};
+
+template<typename xpu, typename DType>
+class DeformableConvolutionIm2ColOp : public Operator {
+ public:
+  explicit DeformableConvolutionIm2ColOp(DeformableConvolutionIm2ColParam p) {
+    this->param_ = p;
+    // convert MBytes first to Bytes and then to elements.
+    param_.workspace = (param_.workspace << 20) / sizeof(DType);
+    CHECK(param_.layout.value() == mshadow::kNCW ||
+          param_.layout.value() == mshadow::kNCHW ||
+          param_.layout.value() == mshadow::kNCDHW)
+      << "Only support NCW, NCHW and NCDHW layout";
+  }
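Editor's note: a quick standalone check of the MB-to-elements conversion in the constructor (a sketch, not from the source):

    # 1024 MB expressed in float32 elements, mirroring
    # param_.workspace = (param_.workspace << 20) / sizeof(DType)
    workspace_mb = 1024
    sizeof_float32 = 4
    elements = (workspace_mb << 20) // sizeof_float32
    assert elements == 268435456  # 256M float32 values == 1 GiB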
+
+  virtual void Forward(const OpContext &ctx,
+                       const std::vector<TBlob> &in_data,
+                       const std::vector<OpReqType> &req,
+                       const std::vector<TBlob> &out_data,
+                       const std::vector<TBlob> &aux_args) {
+    using namespace mshadow;
+    using namespace mshadow::expr;
+    CHECK_EQ(req[conv::kOut], kWriteTo);
+    CHECK_EQ(in_data.size(), 2U);
+    CHECK_EQ(out_data.size(), 1U);
+    LayerSetUp(in_data[conv::kData].shape_,
+               in_data[conv::kOffset].shape_,
+               in_data[conv::kData].shape_);
+    Stream<xpu>* s = ctx.get_stream<xpu>();
+    // allocate workspace for col_buffer
+    Tensor<xpu, 1, DType> workspace = ctx.requested[conv::kTempSpace]
+      .get_space_typed<xpu, 1, DType>(Shape1(col_buffer_size_), s);
+    // calculate the shape of col_buffer
+    mxnet::TShape col_buffer_shape(num_spatial_axes_ + 1, -1);
+    col_buffer_shape[0] = conv_in_channels_ * param_.kernel.Size();
+    for (size_t i = 1; i < col_buffer_shape.ndim(); ++i) {
+      col_buffer_shape[i] = in_data[conv::kData].shape_[i + 1];
+    }
+    // create a column buffer using workspace and col_buffer_shape
+    TBlob col_buffer(workspace.dptr_, col_buffer_shape, xpu::kDevMask, DataType<DType>::kFlag);
+
+    // initialize weight and col_buffer 3D tensors for using gemm
+    // index_t M = conv_out_channels_ / group_;
+    index_t N = conv_out_spatial_dim_;
+    index_t K = kernel_dim_;
+    Tensor<xpu, 2, DType> col_buffer_2d = col_buffer.get_with_shape<xpu, 2, DType>(
+      Shape2(K, N), s);
+    Tensor<xpu, 3, DType> output_3d = out_data[conv::kOut].get_with_shape<xpu, 3, DType>(
+      Shape3(num_, K, N), s);
+
+    for (index_t n = 0; n < num_; ++n) {
+      // transform image to col_buffer in order to use gemm
+      deformable_im2col(
+        s,
+        in_data[conv::kData].dptr<DType>() + n*input_dim_,
+        in_data[conv::kOffset].dptr<DType>() + n*input_offset_dim_,
+        in_data[conv::kData].shape_,
+        col_buffer.shape_,
+        param_.kernel,
+        param_.pad,
+        param_.stride,
+        param_.dilate,
+        param_.num_deformable_group,
+        col_buffer.dptr<DType>());
+      Copy(output_3d[n], col_buffer_2d, s);
+    }
+  }
+
+  virtual void Backward(const OpContext &ctx,
+                        const std::vector<TBlob>& out_grad,
+                        const std::vector<TBlob>& in_data,
+                        const std::vector<TBlob>& out_data,
+                        const std::vector<OpReqType>& req,
+                        const std::vector<TBlob>& in_grad,
+                        const std::vector<TBlob>& aux_args) {
+    using namespace mshadow;
+    using namespace mshadow::expr;
+    CHECK_EQ(out_grad.size(), 1U);
+    size_t expected = 2;
+    CHECK(in_data.size() == expected && in_grad.size() == expected);
+    CHECK_EQ(req.size(), expected);
+    LayerSetUp(in_grad[conv::kData].shape_,
+               in_grad[conv::kOffset].shape_,
+               in_grad[conv::kData].shape_);
+    Stream<xpu> *s = ctx.get_stream<xpu>();
+    // allocate workspace for col_buffer
+    Tensor<xpu, 1, DType> workspace = ctx.requested[conv::kTempSpace]
+      .get_space_typed<xpu, 1, DType>(Shape1(col_buffer_size_), s);
+    // calculate the shape of col_buffer
+    mxnet::TShape col_buffer_shape(num_spatial_axes_ + 1, -1);
+    col_buffer_shape[0] = conv_in_channels_ * param_.kernel.Size();
+    for (index_t i = 1; i < col_buffer_shape.ndim(); ++i) {
+      col_buffer_shape[i] = in_grad[conv::kData].shape_[i + 1];
+    }
+
+    // create a column buffer using workspace and col_buffer_shape
+    TBlob col_buffer(workspace.dptr_, col_buffer_shape, xpu::kDevMask, DataType<DType>::kFlag);
+
+    // initialize weight and col_buffer 3D tensors for using gemm
+    // For computing dLoss/d(in_data[kData])
+    index_t M = kernel_dim_;
+    index_t N = conv_out_spatial_dim_;
+    // index_t K = conv_out_channels_ / group_;
+    Tensor<xpu, 3, DType> out_grad_3d = out_grad[conv::kOut].get_with_shape<xpu, 3, DType>(
+      Shape3(num_, M, N), s);
+    Tensor<xpu, 2, DType> col_buffer_2d = col_buffer.get_with_shape<xpu, 2, DType>(
+      Shape2(M, N), s);
+    // zero-initialize the gradients; only dLoss/d(data) is filled below,
+    // the offset input receives no gradient from this op
+    Tensor<xpu, 1, DType> data_grad = in_grad[conv::kData].FlatTo1D<xpu, DType>(s);
+    data_grad = 0;
+    Tensor<xpu, 1, DType> offset_grad = in_grad[conv::kOffset].FlatTo1D<xpu, DType>(s);
+    offset_grad = 0;
+
+    for (index_t n = 0; n < num_; ++n) {
+      Tensor<xpu, 2, DType> out_grad_2d = out_grad_3d[n];
+      Copy(col_buffer_2d, out_grad_2d, s);
+
+      // gradient w.r.t. input data
+      deformable_col2im(
+        s,
+        col_buffer.dptr<DType>(),
+        in_data[conv::kOffset].dptr<DType>() + n*input_offset_dim_,
+        in_grad[conv::kData].shape_,
+        col_buffer.shape_,
+        param_.kernel,
+        param_.pad,
+        param_.stride,
+        param_.dilate,
+        param_.num_deformable_group,
+        in_grad[conv::kData].dptr<DType>() + n*input_dim_);
+    }
+  }
+
+ private:
+  void LayerSetUp(const mxnet::TShape& ishape,
+                  const mxnet::TShape& offset_shape,
+                  const mxnet::TShape& oshape) {
+    channel_axis_ = 1;  // hard code channel axis
+    const index_t first_spatial_axis = channel_axis_ + 1;
+    const index_t num_axes = param_.kernel.ndim() + 2;
+    num_spatial_axes_ = num_axes - first_spatial_axis;
+    is_1x1_ = true;
+    for (index_t i = 0; i < param_.kernel.ndim(); ++i) {
+      is_1x1_ &= param_.kernel[i] == 1 && param_.stride[i] == 1 && param_.pad[i] == 0;
+      if (!is_1x1_) break;
+    }
+
+    // batch size
+    num_ = ishape[0];
+    // number of input channels
+    channels_ = ishape[1];
+    group_ = param_.num_group;
+    conv_out_channels_ = channels_;
+    conv_in_channels_ = channels_;
+    kernel_dim_ = conv_in_channels_ / group_ * param_.kernel.Size();
+    weight_offset_ = conv_out_channels_ * kernel_dim_ / group_;
+    conv_out_spatial_dim_ = oshape.ProdShape(2, oshape.ndim());
+    col_offset_ = kernel_dim_ * conv_out_spatial_dim_;
+    output_offset_ = conv_out_channels_ * conv_out_spatial_dim_ / group_;
+    // size of the column buffer used for storing im2col-ed pixels
+    col_buffer_size_ = kernel_dim_ * group_ * conv_out_spatial_dim_;
+    // input/output image size (#channels * height * width)
+    input_dim_ = ishape.ProdShape(1, ishape.ndim());
+    input_offset_dim_ = offset_shape.ProdShape(1, offset_shape.ndim());
+    output_dim_ = oshape.ProdShape(1, oshape.ndim());
+    num_kernels_im2col_ = conv_in_channels_ * conv_out_spatial_dim_;
+    num_kernels_col2im_ = input_dim_;
+  }
+
+ private:
+  DeformableConvolutionIm2ColParam param_;
+  index_t channel_axis_;          // channel axis of the input
+  index_t channels_;              // number of channels of input image
+  index_t num_spatial_axes_;      // number of spatial axes
+  index_t num_;                   // batch size
+  index_t group_;                 // number of groups
+  index_t conv_out_channels_;     // number of output channels (num_filter)
+  index_t conv_out_spatial_dim_;  // number of pixels of output images per channel
+  index_t conv_in_channels_;      // number of input channels
+  index_t kernel_dim_;            // number of input channels per group * kernel size
+  index_t weight_offset_;         // number of output channels per group * kernel_dim_
+  index_t col_offset_;
+  index_t output_offset_;
+  index_t col_buffer_size_;
+  index_t input_dim_;
+  index_t input_offset_dim_;
+  index_t output_dim_;
+  index_t num_kernels_im2col_;
+  index_t num_kernels_col2im_;
+  bool is_1x1_;
+};  // class DeformableConvolutionIm2ColOp
+
+template<typename xpu>
+Operator* CreateOp(DeformableConvolutionIm2ColParam param, int dtype,
+                   mxnet::ShapeVector *in_shape,
+                   mxnet::ShapeVector *out_shape,
+                   Context ctx);
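Editor's note: to make the bookkeeping above concrete, a small sketch (made-up sizes) of the shapes the forward pass produces. `kernel_dim_` is C/group * kh * kw, and the op emits the raw column buffer instead of a convolution result:

    # Illustrative shape arithmetic for the im2col-style forward (group = 1):
    n, c, h, w = 2, 256, 32, 32      # data:   (n, c, h, w)
    kh, kw = 7, 7                    # kernel
    kernel_dim = c * kh * kw         # rows of the column buffer
    out_spatial = h * w              # stride 1, pad (3, 3) keeps h * w
    # offset: (n, num_deformable_group * 2 * kh * kw, h, w)
    # out:    (n, kernel_dim, out_spatial)
    assert (n, kernel_dim, out_spatial) == (2, 12544, 1024)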
+
+#if DMLC_USE_CXX11
+class DeformableConvolutionIm2ColProp : public OperatorProperty {
+ public:
+  std::vector<std::string> ListArguments() const override {
+    return{ "data", "offset" };
+  }
+
+  void Init(const std::vector<std::pair<std::string, std::string> >& kwargs) override {
+    using namespace mshadow;
+    param_.Init(kwargs);
+    if (param_.kernel.ndim() == 2) {
+      param_.layout = param_.layout ? param_.layout.value() : mshadow::kNCHW;
+      if (param_.stride.ndim() == 0) param_.stride = Shape2(1, 1);
+      if (param_.dilate.ndim() == 0) param_.dilate = Shape2(1, 1);
+      if (param_.pad.ndim() == 0) param_.pad = Shape2(0, 0);
+    } else {
+      LOG(FATAL) << "not implemented";
+    }
+  }
+
+  std::map<std::string, std::string> GetParams() const override {
+    return param_.__DICT__();
+  }
+
+  bool InferShape(mxnet::ShapeVector *in_shape,
+                  mxnet::ShapeVector *out_shape,
+                  mxnet::ShapeVector *aux_shape) const override {
+    using namespace mshadow;
+    CHECK_EQ(in_shape->size(), 2U) << "Input:[data, offset]";
+    out_shape->resize(1, mxnet::TShape());
+    const mxnet::TShape &dshp = (*in_shape)[conv::kData];
+    const mxnet::TShape &oshp = (*in_shape)[conv::kOffset];
+    if (dshp.ndim() == 0) return false;
+    if (param_.kernel.ndim() == 2) {
+      // 2d conv
+      CHECK_EQ(dshp.ndim(), 4U) \
+        << "Input data should be 4D in batch-num_filter-y-x";
+      CHECK_EQ(oshp.ndim(), 4U) \
+        << "Input offset should be 4D in batch-num_filter-y-x";
+      Shape<4> dshape = ConvertLayout(dshp.get<4>(), param_.layout.value(), kNCHW);
+      Shape<4> offsetshape = ConvertLayout(oshp.get<4>(), param_.layout.value(), kNCHW);
+
+      const index_t ksize_y = static_cast<index_t>(param_.kernel[0]);
+      const index_t ksize_x = static_cast<index_t>(param_.kernel[1]);
+      CHECK_EQ(dshape[1] % param_.num_group, 0U) \
+        << "input num_filter must divide group size";
+      CHECK_EQ(dshape[1] % param_.num_deformable_group, 0U) \
+        << "input num_filter must divide deformable group size";
+      CHECK_EQ(param_.num_filter % param_.num_group, 0U) \
+        << "output num_filter must divide group size";
+      CHECK_GT(param_.kernel.Size(), 0U) \
+        << "incorrect kernel size: " << param_.kernel;
+      CHECK_GT(param_.stride.Size(), 0U) \
+        << "incorrect stride size: " << param_.stride;
+      CHECK_GT(param_.dilate.Size(), 0U) \
+        << "incorrect dilate size: " << param_.dilate;
+      // unlike the original deformable convolution, the output here is the
+      // raw column buffer: (n, c * kh * kw, out_h * out_w) flattened spatially
+      Shape<4> oshape;
+      oshape[0] = dshape[0];
+      oshape[1] = dshape[1] * param_.kernel.Size();
+      oshape[2] = (dshape[2] + 2 * param_.pad[0] -
+                   (param_.dilate[0] * (ksize_y - 1) + 1)) / param_.stride[0] + 1;
+      oshape[3] = (dshape[3] + 2 * param_.pad[1] -
+                   (param_.dilate[1] * (ksize_x - 1) + 1)) / param_.stride[1] + 1;
+      SHAPE_ASSIGN_CHECK(*out_shape, 0, oshape);
+      CHECK_EQ(offsetshape[1] % (param_.kernel[0] * param_.kernel[1]), 0U) \
+        << "offset filter must divide deformable group size";
+      CHECK_EQ(offsetshape[1] / (2 * param_.kernel[0] * param_.kernel[1]), \
+               param_.num_deformable_group) \
+        << "offset filter must divide deformable group size";
+      return true;
+    } else {
+      LOG(FATAL) << "not implemented";
+      return false;
+    }
+  }
+
+  bool InferType(std::vector<int> *in_type,
+                 std::vector<int> *out_type,
+                 std::vector<int> *aux_type) const override {
+    CHECK_GE(in_type->size(), 1U);
+    int dtype = (*in_type)[0];
+    CHECK_NE(dtype, -1) << "First input must have specified type";
+    for (size_t i = 0; i < in_type->size(); ++i) {
+      if ((*in_type)[i] == -1) {
+        (*in_type)[i] = dtype;
+      } else {
+        UNIFORM_TYPE_CHECK((*in_type)[i], dtype, ListArguments()[i]);
+      }
+    }
+    out_type->clear();
+    out_type->push_back(dtype);
+    return true;
+  }
+
+  OperatorProperty* Copy() const override {
+    auto ptr = new DeformableConvolutionIm2ColProp();
+    ptr->param_ = param_;
+    return ptr;
+  }
+
+  std::string TypeString() const override {
+    return "_contrib_DeformableConvolutionIm2Col";
+  }
+
+  std::vector<int> DeclareBackwardDependency(
+    const std::vector<int> &out_grad,
+    const std::vector<int> &in_data,
+    const std::vector<int> &out_data) const override {
+    return{ out_grad[conv::kOut], in_data[conv::kData],
+            in_data[conv::kOffset] };
+  }
+
+  std::vector<ResourceRequest> ForwardResource(
+    const mxnet::ShapeVector &in_shape) const override {
+    return{ ResourceRequest::kTempSpace };
+  }
+
+  std::vector<ResourceRequest> BackwardResource(
+    const mxnet::ShapeVector &in_shape) const override {
+    return{ ResourceRequest::kTempSpace };
+  }
+
+  Operator* CreateOperator(Context ctx) const override {
+    LOG(FATAL) << "Not Implemented.";
+    return NULL;
+  }
+
+  Operator* CreateOperatorEx(Context ctx, mxnet::ShapeVector *in_shape,
+                             std::vector<int> *in_type) const override;
+
+ private:
+  DeformableConvolutionIm2ColParam param_;
+};  // class DeformableConvolutionIm2ColProp
+#endif  // DMLC_USE_CXX11
+}  // namespace op
+}  // namespace mxnet
+#endif  // MXNET_OPERATOR_CONTRIB_DEFORMABLE_CONVOLUTION_IM2COL_INL_H_
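Editor's note: the output spatial size computed in InferShape above (and restated in the operator docstring registered below) follows the usual convolution arithmetic; a quick standalone check:

    def out_size(x, k, p, s, d):
        # f(x, k, p, s, d) = floor((x + 2p - (d * (k - 1) + 1)) / s) + 1
        return (x + 2 * p - (d * (k - 1) + 1)) // s + 1

    # a 7x7 kernel with pad 3, stride 1, dilation 1 preserves the spatial size
    assert out_size(32, 7, 3, 1, 1) == 32
    # stride 2 halves it (rounding down)
    assert out_size(32, 3, 1, 2, 1) == 16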
diff --git a/operator_cxx/contrib/deformable_convolution_im2col.cc b/operator_cxx/contrib/deformable_convolution_im2col.cc
new file mode 100644
index 0000000..30613dc
--- /dev/null
+++ b/operator_cxx/contrib/deformable_convolution_im2col.cc
@@ -0,0 +1,104 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * Copyright (c) 2017 Microsoft
+ * Licensed under The Apache-2.0 License [see LICENSE for details]
+ * \file deformable_convolution_im2col.cc
+ * \brief
+ * \author Yuwen Xiong, Haozhi Qi, Jifeng Dai, Chenxia Han, Yuntao Chen
+*/
+
+#include "./deformable_convolution_im2col-inl.h"
+
+namespace mxnet {
+namespace op {
+DMLC_REGISTER_PARAMETER(DeformableConvolutionIm2ColParam);
+
+template<>
+Operator* CreateOp<cpu>(DeformableConvolutionIm2ColParam param, int dtype,
+                        mxnet::ShapeVector *in_shape,
+                        mxnet::ShapeVector *out_shape,
+                        Context ctx) {
+  Operator *op = nullptr;
+  MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
+    op = new DeformableConvolutionIm2ColOp<cpu, DType>(param);
+  })
+  return op;
+}
+
+// DO_BIND_DISPATCH comes from operator_common.h
+Operator *DeformableConvolutionIm2ColProp::CreateOperatorEx(Context ctx,
+                                                            mxnet::ShapeVector *in_shape,
+                                                            std::vector<int> *in_type) const {
+  mxnet::ShapeVector out_shape, aux_shape;
+  std::vector<int> out_type, aux_type;
+  CHECK(InferType(in_type, &out_type, &aux_type));
+  CHECK(InferShape(in_shape, &out_shape, &aux_shape));
+  DO_BIND_DISPATCH(CreateOp, param_, (*in_type)[0], in_shape, &out_shape, ctx);
+}
+
+MXNET_REGISTER_OP_PROPERTY(_contrib_DeformableConvolutionIm2Col, DeformableConvolutionIm2ColProp)
+.describe(R"code(Compute 2-D deformable convolution on 4-D input.
+
+The deformable convolution operation is described in https://arxiv.org/abs/1703.06211
+
+For 2-D deformable convolution, the shapes are
+
+- **data**: *(batch_size, channel, height, width)*
+- **offset**: *(batch_size, num_deformable_group * kernel[0] * kernel[1], height, width)*
+- **out**: *(batch_size, channel * kernel[0] * kernel[1], out_height * out_width)*.
+
+Define::
+
+  f(x,k,p,s,d) = floor((x+2*p-d*(k-1)-1)/s)+1
+
+then we have::
+
+  out_height=f(height, kernel[0], pad[0], stride[0], dilate[0])
+  out_width=f(width, kernel[1], pad[1], stride[1], dilate[1])
+
+The default data ``layout`` is *NCHW*, namely *(batch_size, channel, height,
+width)*.
+
+If ``num_group`` is larger than 1, denoted by *g*, then split the input ``data``
+evenly into *g* parts along the channel axis, and also evenly split ``weight``
+along the first dimension. Next compute the convolution on the *i*-th part of
+the data with the *i*-th weight part. The output is obtained by concatenating all
+the *g* results.
+
+If ``num_deformable_group`` is larger than 1, denoted by *dg*, then split the
+input ``offset`` evenly into *dg* parts along the channel axis, and also split
+``out`` evenly into *dg* parts along the channel axis.
+Next compute the deformable convolution, applying the *i*-th part of the offset
+to the *i*-th part of the output.
+
+)code" ADD_FILELINE)
+.add_argument("data", "NDArray-or-Symbol", "Input data to the DeformableConvolutionIm2ColOp.")
+.add_argument("offset", "NDArray-or-Symbol", "Input offset to the DeformableConvolutionIm2ColOp.")
+.add_arguments(DeformableConvolutionIm2ColParam::__FIELDS__());
+
+}  // namespace op
+}  // namespace mxnet
diff --git a/operator_cxx/contrib/deformable_convolution_im2col.cu b/operator_cxx/contrib/deformable_convolution_im2col.cu
new file mode 100644
index 0000000..f054c22
--- /dev/null
+++ b/operator_cxx/contrib/deformable_convolution_im2col.cu
@@ -0,0 +1,48 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * Copyright (c) 2017 Microsoft
+ * Licensed under The Apache-2.0 License [see LICENSE for details]
+ * \file deformable_convolution_im2col.cu
+ * \brief
+ * \author Yuwen Xiong, Haozhi Qi, Jifeng Dai, Chenxia Han, Yuntao Chen
+*/
+
+#include "./deformable_convolution_im2col-inl.h"
+#include <vector>
+
+namespace mxnet {
+namespace op {
+
+template<>
+Operator* CreateOp<gpu>(DeformableConvolutionIm2ColParam param, int dtype,
+                        mxnet::ShapeVector *in_shape,
+                        mxnet::ShapeVector *out_shape,
+                        Context ctx) {
+  Operator *op = NULL;
+  MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
+    op = new DeformableConvolutionIm2ColOp<gpu, DType>(param);
+  })
+  return op;
+}
+
+}  // namespace op
+}  // namespace mxnet
diff --git a/operator_cxx/contrib/get_anchor_offset-inl.h b/operator_cxx/contrib/get_anchor_offset-inl.h
new file mode 100644
index 0000000..62238e5
--- /dev/null
+++ b/operator_cxx/contrib/get_anchor_offset-inl.h
@@ -0,0 +1,277 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file get_anchor_offset-inl.h
+ * \brief GetAnchorOffset Operator
+ * \author Chenxia Han
+*/
+#ifndef MXNET_OPERATOR_CONTRIB_GET_ANCHOR_OFFSET_INL_H_
+#define MXNET_OPERATOR_CONTRIB_GET_ANCHOR_OFFSET_INL_H_
+
+#include <dmlc/logging.h>
+#include <dmlc/parameter.h>
+#include <mxnet/operator.h>
+#include <cmath>
+#include <cstring>
+#include <ctime>
+#include <iostream>
+#include <algorithm>
+#include <map>
+#include <vector>
+#include <string>
+#include <utility>
+#include "../operator_common.h"
+#include "../mshadow_op.h"
+
+namespace mxnet {
+namespace op {
+
+namespace get_anchor_offset {
+enum GetAnchorOffsetOpInputs {kData, kAnchor};
+enum GetAnchorOffsetOpOutputs {kOut};
+}  // namespace get_anchor_offset
+
+struct GetAnchorOffsetParam : public dmlc::Parameter<GetAnchorOffsetParam> {
+  mxnet::TShape kernel;
+  int stride;
+  DMLC_DECLARE_PARAMETER(GetAnchorOffsetParam) {
+    DMLC_DECLARE_FIELD(kernel).describe("Sample size for each anchor: (h, w)");
+    DMLC_DECLARE_FIELD(stride).describe("Stride at current layer");
+  }
+};
+
+struct anchor_to_offset {
+  template<typename DType>
+  MSHADOW_XINLINE static void Map(int i, int k1, int k2, int stride,
+                                  int num_anchors, int height, int width,
+                                  const DType *anchor, DType *out) {
+    int w = i % width;
+    int h = i / width % height;
+    int a = i / width / height % num_anchors;
+    int n = i / width / height / num_anchors;
+
+    int num_offset = num_anchors * k1 * k2 * 2;
+
+    for (int kh = 0; kh < k1; ++kh) {
+      for (int kw = 0; kw < k2; ++kw) {
+        const DType *box = anchor + ((((n * height) + h) * width + w) * num_anchors + a) * 4;
+        DType x1 = box[0] / stride;
+        DType y1 = box[1] / stride;
+        DType x2 = box[2] / stride;
+        DType y2 = box[3] / stride;
+
+        DType bin_size_x = (x2 - x1 + 1) / k2;
+        DType bin_size_y = (y2 - y1 + 1) / k1;
+
+        int offset_idx = ((a * k1 + kh) * k2 + kw) * 2;
+        int offset_idx_x = ((n * num_offset + offset_idx + 1) * height + h) * width + w;
+        int offset_idx_y = ((n * num_offset + offset_idx + 0) * height + h) * width + w;
+
+        // where to sample: the center of bin (kh, kw) inside the anchor box
+        out[offset_idx_x] = x1 + (bin_size_x-1) / 2 + kw * bin_size_x;
+        out[offset_idx_y] = y1 + (bin_size_y-1) / 2 + kh * bin_size_y;
+
+        // minus where a regular k1 x k2 kernel would sample anyway
+        out[offset_idx_x] -= w + kw - (k2-1) / 2;
+        out[offset_idx_y] -= h + kh - (k1-1) / 2;
+      }
+    }
+  }
+};
+
+template<typename xpu, typename DType>
+class GetAnchorOffsetOp : public Operator {
+ public:
+  explicit GetAnchorOffsetOp(GetAnchorOffsetParam param) {
+    this->param_ = param;
+  }
+
+  virtual void Forward(const OpContext &ctx,
+                       const std::vector<TBlob> &in_data,
+                       const std::vector<OpReqType> &req,
+                       const std::vector<TBlob> &out_data,
+                       const std::vector<TBlob> &aux_states) {
+    using namespace mshadow;
+    using namespace mshadow::expr;
+    using namespace mxnet_op;
+    CHECK_EQ(in_data.size(), 2U);
+    CHECK_EQ(out_data.size(), 1U);
+    CHECK_EQ(req.size(), 1U);
+    CHECK_EQ(req[get_anchor_offset::kOut], kWriteTo);
+
+    Stream<xpu> *s = ctx.get_stream<xpu>();
+
+    /*
+     * data:   (n, c, h, w)
+     * anchor: (n, h * w * a, 4)
+     * out:    (n, a * k_1 * k_2 * 2, h, w)
+     */
+    Tensor<xpu, 4, DType> data = in_data[get_anchor_offset::kData].get<xpu, 4, DType>(s);
+    Tensor<xpu, 3, DType> anchor = in_data[get_anchor_offset::kAnchor].get<xpu, 3, DType>(s);
+    Tensor<xpu, 4, DType> out = out_data[get_anchor_offset::kOut].get<xpu, 4, DType>(s);
+
+    int height = data.size(2);
+    int width = data.size(3);
+    int num_anchors = anchor.size(1) / height / width;
+    int count = anchor.shape_.ProdShape(0, 2);
+
+    Kernel<anchor_to_offset, xpu>::Launch(s, count, param_.kernel[0], param_.kernel[1],
+      param_.stride, num_anchors, height, width, anchor.dptr_, out.dptr_);
+  }
+
+  virtual void Backward(const OpContext &ctx,
+                        const std::vector<TBlob> &out_grad,
+                        const std::vector<TBlob> &in_data,
+                        const std::vector<TBlob> &out_data,
+                        const std::vector<OpReqType> &req,
+                        const std::vector<TBlob> &in_grad,
+                        const std::vector<TBlob> &aux_states) {
+    using namespace mshadow;
+    using namespace mshadow::expr;
+    CHECK_EQ(in_grad.size(), 2U);
+    Stream<xpu> *s = ctx.get_stream<xpu>();
+    Tensor<xpu, 4, DType> gdata = in_grad[get_anchor_offset::kData].get<xpu, 4, DType>(s);
+    Tensor<xpu, 3, DType> ganchor = in_grad[get_anchor_offset::kAnchor].get<xpu, 3, DType>(s);
+
+    // neither input receives a gradient from this op
+    Assign(gdata, req[get_anchor_offset::kData], 0);
+    Assign(ganchor, req[get_anchor_offset::kAnchor], 0);
+  }
+
+ private:
+  GetAnchorOffsetParam param_;
+};  // class GetAnchorOffsetOp
+
+template<typename xpu>
+Operator *CreateOp(GetAnchorOffsetParam param, int dtype);
+
+#if DMLC_USE_CXX11
+class GetAnchorOffsetProp : public OperatorProperty {
+ public:
+  void Init(const std::vector<std::pair<std::string, std::string> >& kwargs) override {
+    param_.Init(kwargs);
+  }
+
+  std::map<std::string, std::string> GetParams() const override {
+    return param_.__DICT__();
+  }
+
+  bool InferShape(std::vector<TShape> *in_shape,
+                  std::vector<TShape> *out_shape,
+                  std::vector<TShape> *aux_shape) const override {
+    using namespace mshadow;
+    CHECK_EQ(in_shape->size(), 2) << "Input:[data, anchor]";
+    const TShape &dshape = in_shape->at(get_anchor_offset::kData);
+    const TShape &ashape = in_shape->at(get_anchor_offset::kAnchor);
+
+    const int num_image = dshape[0];
+    const int channel = dshape[1];
+    const int height = dshape[2];
+    const int width = dshape[3];
+    const int num_anchors = ashape[1] / (height * width);
+    const TShape &kernel = param_.kernel;
+
+    auto data_shape = Shape4(num_image, channel, height, width);
+    auto anchor_shape = Shape3(num_image, height * width * num_anchors, 4);
+    auto offset_shape = Shape4(num_image, num_anchors * kernel[0] * kernel[1] * 2, height, width);
+
+    SHAPE_ASSIGN_CHECK(*in_shape, get_anchor_offset::kData, data_shape);
+    SHAPE_ASSIGN_CHECK(*in_shape, get_anchor_offset::kAnchor, anchor_shape);
+
+    out_shape->clear();
+    // output
+    out_shape->push_back(offset_shape);
+    return true;
+  }
+
+  bool InferType(std::vector<int> *in_type,
+                 std::vector<int> *out_type,
+                 std::vector<int> *aux_type) const override {
+    CHECK_EQ(in_type->size(), 2U);
+    int dtype = (*in_type)[0];
+    CHECK_NE(dtype, -1) << "First input must have specified type";
+    for (size_t i = 0; i < in_type->size(); ++i) {
+      if ((*in_type)[i] == -1) {
+        (*in_type)[i] = dtype;
+      } else {
+        UNIFORM_TYPE_CHECK((*in_type)[i], dtype, ListArguments()[i]);
+      }
+    }
+    out_type->clear();
+    out_type->push_back(dtype);
+    return true;
+  }
+
+  OperatorProperty* Copy() const override {
+    auto ptr = new GetAnchorOffsetProp();
+    ptr->param_ = param_;
+    return ptr;
+  }
+
+  std::string TypeString() const override {
+    return "_contrib_GetAnchorOffset";
+  }
+
+  int NumOutputs() const override {
+    return 1;
+  }
+
+  std::vector<std::string> ListArguments() const override {
+    return {"data", "anchor"};
+  }
+
+  std::vector<std::string> ListOutputs() const override {
+    return {"output"};
+  }
+
+  std::vector<int> DeclareBackwardDependency(
+    const std::vector<int> &out_grad,
+    const std::vector<int> &in_data,
+    const std::vector<int> &out_data) const override {
+    return {};
+  }
+
+  Operator *CreateOperator(Context ctx) const override {
+    LOG(FATAL) << "Not Implemented.";
+    return NULL;
+  }
+
+  Operator *CreateOperatorEx(Context ctx, std::vector<TShape> *in_shape,
+                             std::vector<int> *in_type) const override;
+
+ private:
+  GetAnchorOffsetParam param_;
+};  // class GetAnchorOffsetProp
+
+#endif  // DMLC_USE_CXX11
+}  // namespace op
+}  // namespace mxnet
+
+#endif  // MXNET_OPERATOR_CONTRIB_GET_ANCHOR_OFFSET_INL_H_
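Editor's note: to see what the anchor_to_offset kernel in this header computes, here is a hedged NumPy transcription for a single spatial location (a sketch for illustration; the authoritative version is the C++ kernel above):

    import numpy as np

    # NumPy transcription of anchor_to_offset for one location (h, w) on the
    # feature map; `box` is an (x1, y1, x2, y2) anchor in image coordinates.
    def anchor_offset(box, stride, k1, k2, h, w):
        x1, y1, x2, y2 = [float(v) / stride for v in box]
        bin_x = (x2 - x1 + 1) / k2          # bin width inside the box
        bin_y = (y2 - y1 + 1) / k1          # bin height inside the box
        offset = np.zeros((k1, k2, 2), dtype=np.float32)  # (dy, dx) per tap
        for kh in range(k1):
            for kw in range(k2):
                # where we want to sample: the center of bin (kh, kw)
                sx = x1 + (bin_x - 1) / 2 + kw * bin_x
                sy = y1 + (bin_y - 1) / 2 + kh * bin_y
                # minus where a regular k1 x k2 kernel would sample anyway
                offset[kh, kw, 1] = sx - (w + kw - (k2 - 1) // 2)
                offset[kh, kw, 0] = sy - (h + kh - (k1 - 1) // 2)
        return offset

    # An anchor spanning exactly the regular 3x3 grid around (h, w) = (2, 2)
    # at stride 8 yields zero offsets, i.e. plain convolution.
    assert np.allclose(anchor_offset((8, 8, 24, 24), 8, 3, 3, 2, 2), 0)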
diff --git a/operator_cxx/contrib/get_anchor_offset.cc b/operator_cxx/contrib/get_anchor_offset.cc
new file mode 100644
index 0000000..ca1b70b
--- /dev/null
+++ b/operator_cxx/contrib/get_anchor_offset.cc
@@ -0,0 +1,58 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file get_anchor_offset.cc
+ * \brief
+ * \author Chenxia Han
+*/
+
+#include "./get_anchor_offset-inl.h"
+
+namespace mxnet {
+namespace op {
+
+template<>
+Operator *CreateOp<cpu>(GetAnchorOffsetParam param, int dtype) {
+  Operator *op = nullptr;
+  MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
+    op = new GetAnchorOffsetOp<cpu, DType>(param);
+  });
+  return op;
+}
+
+Operator *GetAnchorOffsetProp::CreateOperatorEx(Context ctx, std::vector<TShape> *in_shape,
+                                                std::vector<int> *in_type) const {
+  std::vector<TShape> out_shape, aux_shape;
+  std::vector<int> out_type, aux_type;
+  CHECK(InferType(in_type, &out_type, &aux_type));
+  CHECK(InferShape(in_shape, &out_shape, &aux_shape));
+  DO_BIND_DISPATCH(CreateOp, param_, in_type->at(0));
+}
+
+DMLC_REGISTER_PARAMETER(GetAnchorOffsetParam);
+
+MXNET_REGISTER_OP_PROPERTY(_contrib_GetAnchorOffset, GetAnchorOffsetProp)
+.describe("Compute offset for Deformable Convolution")
+.add_argument("data", "NDArray-or-Symbol", "Data to determine height and width")
+.add_argument("anchor", "NDArray-or-Symbol", "Anchor to determine offset")
+.add_arguments(GetAnchorOffsetParam::__FIELDS__());
+
+}  // namespace op
+}  // namespace mxnet
diff --git a/operator_cxx/contrib/get_anchor_offset.cu b/operator_cxx/contrib/get_anchor_offset.cu
new file mode 100644
index 0000000..41ce243
--- /dev/null
+++ b/operator_cxx/contrib/get_anchor_offset.cu
@@ -0,0 +1,41 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file get_anchor_offset.cu
+ * \brief
+ * \author Chenxia Han
+*/
+
+#include "./get_anchor_offset-inl.h"
+
+namespace mxnet {
+namespace op {
+
+template<>
+Operator *CreateOp<gpu>(GetAnchorOffsetParam param, int dtype) {
+  Operator *op = NULL;
+  MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
+    op = new GetAnchorOffsetOp<gpu, DType>(param);
+  });
+  return op;
+}
+
+}  // namespace op
+}  // namespace mxnet
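Editor's note: a hedged sketch of how the two contrib ops registered in this patch might be composed in the alignment head (the real wiring lives in models/aligndet/builder.py; all variable names and shapes here are illustrative):

    import mxnet as mx

    feat = mx.sym.var("fpn_p3")       # (n, 256, h, w), stride-8 FPN level
    anchor = mx.sym.var("proposal")   # (n, h * w * num_anchors, 4), refined boxes

    # offset: (n, num_anchors * 7 * 7 * 2, h, w)
    offset = mx.sym.contrib.GetAnchorOffset(
        data=feat, anchor=anchor, kernel=(7, 7), stride=8)
    # column features sampled at the anchor-aligned locations:
    # (n, 256 * 7 * 7, h * w)
    col = mx.sym.contrib.DeformableConvolutionIm2Col(
        data=feat, offset=offset, kernel=(7, 7), pad=(3, 3),
        num_filter=256, num_deformable_group=1)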