diff --git a/config/aligndet_r101v1_fpn_fp16_1x.py b/config/aligndet_r101v1_fpn_fp16_1x.py new file mode 100644 index 0000000..816d94d --- /dev/null +++ b/config/aligndet_r101v1_fpn_fp16_1x.py @@ -0,0 +1,292 @@ +from symbol.builder import FasterRcnn as Detector +from models.retinanet.builder import MSRAResNet101V1FPN as Backbone +from models.retinanet.builder import RetinaNetNeck as Neck +from models.aligndet.builder import AlignRetinaNetHead as RpnHead +from models.aligndet.builder import AlignRoiExtractor as RoiExtractor +from models.aligndet.builder import AlignHead as BboxHead +from mxnext.complicate import normalizer_factory + + +def get_config(is_train): + class General: + log_frequency = 10 + name = __name__.rsplit("/")[-1].rsplit(".")[-1] + batch_image = 2 if is_train else 1 + fp16 = True + + + class KvstoreParam: + kvstore = "local" + batch_image = General.batch_image + gpus = [0, 1, 2, 3, 4, 5, 6, 7] + fp16 = General.fp16 + + + class NormalizeParam: + normalizer = normalizer_factory(type="fixbn") + + + class BackboneParam: + fp16 = General.fp16 + normalizer = NormalizeParam.normalizer + + + class NeckParam: + fp16 = General.fp16 + normalizer = NormalizeParam.normalizer + + + class RpnParam: + fp16 = General.fp16 + normalizer = NormalizeParam.normalizer + batch_image = General.batch_image + num_class = 1 + 80 + loss_weight = 1.0 + + class anchor_generate: + scale = (4 * 2 ** (1.0 / 3.0),) + ratio = (1.0,) + stride = (8, 16, 32, 64, 128) + short = (100, 50, 25, 13, 7) + long = (167, 84, 42, 21, 11) + image_anchor = None + + class head: + conv_channel = 256 + mean = (0.0, 0.0, 0.0, 0.0) + std = (1.0, 1.0, 1.0, 1.0) + + class focal_loss: + alpha = 0.25 + gamma = 2.0 + + class proposal: + pre_nms_top_n = 1000 + min_bbox_side = 0 + min_det_score = 0.05 + + class subsample_proposal: + pass + + class bbox_target: + class_agnostic = False + mean = (0.0, 0.0, 0.0, 0.0) + std = (1.0, 1.0, 1.0, 1.0) + allowed_border = 9999 + pos_thr = 0.7 + neg_thr = 0.7 + min_pos_thr = 0.0 + + + class BboxParam: + fp16 = General.fp16 + normalizer = NormalizeParam.normalizer + num_class = 1 + 80 + loss_weight = 1.0 + + class anchor_generate: + scale = (4 * 2 ** (1.0 / 3.0),) + ratio = (1.0,) + stride = (8, 16, 32, 64, 128) + + class head: + merge_score = False + num_conv = 2 + use_1x1 = True + conv_channel = 1024 + mean = (0.0, 0.0, 0.0, 0.0) + std = (1.0, 1.0, 1.0, 1.0) + + class focal_loss: + alpha = 0.25 + gamma = 2.0 + + class proposal: + pre_nms_top_n = 1000 + min_bbox_side = 0 + min_det_score = 0.05 + + + class RoiParam: + fp16 = General.fp16 + normalizer = NormalizeParam.normalizer + sample_bins = 7 + im2col = True + stride = (8, 16, 32, 64, 128) + conv_channel = 256 * 7 * 7 + scale = (4 * 2 ** (1.0 / 3.0),) + ratio = (1.0,) + + + class DatasetParam: + if is_train: + image_set = ("coco_train2014", "coco_valminusminival2014") + else: + image_set = ("coco_minival2014", ) + + backbone = Backbone(BackboneParam) + neck = Neck(NeckParam) + rpn_head = RpnHead(RpnParam) + roi_extractor = RoiExtractor(RoiParam) + bbox_head = BboxHead(BboxParam) + detector = Detector() + if is_train: + train_sym = detector.get_train_symbol(backbone, neck, rpn_head, roi_extractor, bbox_head) + rpn_test_sym = None + test_sym = None + else: + train_sym = None + rpn_test_sym = None + test_sym = detector.get_test_symbol(backbone, neck, rpn_head, roi_extractor, bbox_head) + + + class ModelParam: + train_symbol = train_sym + test_symbol = test_sym + rpn_test_symbol = rpn_test_sym + + from_scratch = False + random = True + memonger = 
False + memonger_until = "stage3_unit21_plus" + + class pretrain: + prefix = "pretrain_model/resnet-v1-101" + epoch = 0 + fixed_param = ["conv0", "stage1", "gamma", "beta"] + + + class OptimizeParam: + class optimizer: + type = "sgd" + lr = 0.005 / 8 * len(KvstoreParam.gpus) * KvstoreParam.batch_image + momentum = 0.9 + wd = 0.0001 + clip_gradient = None + + class schedule: + begin_epoch = 0 + end_epoch = 6 + lr_iter = [60000 * 16 // (len(KvstoreParam.gpus) * KvstoreParam.batch_image), + 80000 * 16 // (len(KvstoreParam.gpus) * KvstoreParam.batch_image)] + + class warmup: + type = "gradual" + lr = 0.005 / 8 * len(KvstoreParam.gpus) * KvstoreParam.batch_image / 3 + iter = 500 + + + class TestParam: + min_det_score = 0 # filter appended boxes + max_det_per_image = 100 + + process_roidb = lambda x: x + process_output = lambda x, y: x + + class model: + prefix = "experiments/{}/checkpoint".format(General.name) + epoch = OptimizeParam.schedule.end_epoch + + class nms: + type = "nms" + thr = 0.5 + + class coco: + annotation = "data/coco/annotations/instances_minival2014.json" + + # data processing + class NormParam: + mean = (122.7717, 115.9465, 102.9801) # RGB order + std = (1.0, 1.0, 1.0) + + + class ResizeParam: + short = 800 + long = 1333 + + + class PadParam: + short = 800 + long = 1333 + max_num_gt = 100 + + + class AnchorTarget2DParam: + def __init__(self): + self.generate = self._generate() + self.mean = (0.0, 0.0, 0.0, 0.0) + self.std = (1.0, 1.0, 1.0, 1.0) + self.class_agnostic = False + + class _generate: + def __init__(self): + self.short = (100, 50, 25, 13, 7) + self.long = (167, 84, 42, 21, 11) + self.stride = (8, 16, 32, 64, 128) + + scales = (4 * 2 ** (1.0 / 3.0),) + aspects = (1.0,) + + class assign: + allowed_border = 9999 + pos_thr = 0.4 + neg_thr = 0.3 + min_pos_thr = 0.0 + + class sample: + image_anchor = None + pos_fraction = None + + + class RenameParam: + mapping = dict(image="data") + + + from core.detection_input import ReadRoiRecord, Resize2DImageBbox, \ + ConvertImageFromHwcToChw, Flip2DImageBbox, Pad2DImageBbox, \ + RenameRecord + from models.retinanet.input import PyramidAnchorTarget2D, Norm2DImage + + if is_train: + transform = [ + ReadRoiRecord(None), + Norm2DImage(NormParam), + Resize2DImageBbox(ResizeParam), + Flip2DImageBbox(), + Pad2DImageBbox(PadParam), + ConvertImageFromHwcToChw(), + PyramidAnchorTarget2D(AnchorTarget2DParam()), + RenameRecord(RenameParam.mapping) + ] + data_name = ["data", "im_info", "gt_bbox"] + label_name = ["rpn_cls_label", "rpn_reg_target", "rpn_reg_weight"] + else: + transform = [ + ReadRoiRecord(None), + Norm2DImage(NormParam), + Resize2DImageBbox(ResizeParam), + ConvertImageFromHwcToChw(), + RenameRecord(RenameParam.mapping) + ] + data_name = ["data", "im_info", "im_id", "rec_id"] + label_name = [] + + from models.retinanet import metric + + rpn_acc_metric = metric.FGAccMetric( + "FGAcc", + ["cls_loss_output"], + ["rpn_cls_label"] + ) + + bbox_acc_metric = metric.FGAccMetric( + "BboxFGAcc", + ["align_cls_loss_output", "align_label_blockgrad_output"], + [] + ) + + metric_list = [rpn_acc_metric, bbox_acc_metric] + + return General, KvstoreParam, RpnParam, RoiParam, BboxParam, DatasetParam, \ + ModelParam, OptimizeParam, TestParam, \ + transform, data_name, label_name, metric_list diff --git a/config/aligndet_r50v1_fpn_fp16_1x.py b/config/aligndet_r50v1_fpn_fp16_1x.py new file mode 100644 index 0000000..f85715e --- /dev/null +++ b/config/aligndet_r50v1_fpn_fp16_1x.py @@ -0,0 +1,292 @@ +from symbol.builder import FasterRcnn as Detector 
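+# NOTE: this config mirrors aligndet_r101v1_fpn_fp16_1x above; only the backbone (ResNet-50 vs. ResNet-101) and the matching pretrain prefix differ.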
+from models.retinanet.builder import MSRAResNet50V1FPN as Backbone +from models.retinanet.builder import RetinaNetNeck as Neck +from models.aligndet.builder import AlignRetinaNetHead as RpnHead +from models.aligndet.builder import AlignRoiExtractor as RoiExtractor +from models.aligndet.builder import AlignHead as BboxHead +from mxnext.complicate import normalizer_factory + + +def get_config(is_train): + class General: + log_frequency = 10 + name = __name__.rsplit("/")[-1].rsplit(".")[-1] + batch_image = 2 if is_train else 1 + fp16 = True + + + class KvstoreParam: + kvstore = "local" + batch_image = General.batch_image + gpus = [0, 1, 2, 3, 4, 5, 6, 7] + fp16 = General.fp16 + + + class NormalizeParam: + normalizer = normalizer_factory(type="fixbn") + + + class BackboneParam: + fp16 = General.fp16 + normalizer = NormalizeParam.normalizer + + + class NeckParam: + fp16 = General.fp16 + normalizer = NormalizeParam.normalizer + + + class RpnParam: + fp16 = General.fp16 + normalizer = NormalizeParam.normalizer + batch_image = General.batch_image + num_class = 1 + 80 + loss_weight = 1.0 + + class anchor_generate: + scale = (4 * 2 ** (1.0 / 3.0),) + ratio = (1.0,) + stride = (8, 16, 32, 64, 128) + short = (100, 50, 25, 13, 7) + long = (167, 84, 42, 21, 11) + image_anchor = None + + class head: + conv_channel = 256 + mean = (0.0, 0.0, 0.0, 0.0) + std = (1.0, 1.0, 1.0, 1.0) + + class focal_loss: + alpha = 0.25 + gamma = 2.0 + + class proposal: + pre_nms_top_n = 1000 + min_bbox_side = 0 + min_det_score = 0.05 + + class subsample_proposal: + pass + + class bbox_target: + class_agnostic = False + mean = (0.0, 0.0, 0.0, 0.0) + std = (1.0, 1.0, 1.0, 1.0) + allowed_border = 9999 + pos_thr = 0.7 + neg_thr = 0.7 + min_pos_thr = 0.0 + + + class BboxParam: + fp16 = General.fp16 + normalizer = NormalizeParam.normalizer + num_class = 1 + 80 + loss_weight = 1.0 + + class anchor_generate: + scale = (4 * 2 ** (1.0 / 3.0),) + ratio = (1.0,) + stride = (8, 16, 32, 64, 128) + + class head: + merge_score = False + num_conv = 2 + use_1x1 = True + conv_channel = 1024 + mean = (0.0, 0.0, 0.0, 0.0) + std = (1.0, 1.0, 1.0, 1.0) + + class focal_loss: + alpha = 0.25 + gamma = 2.0 + + class proposal: + pre_nms_top_n = 1000 + min_bbox_side = 0 + min_det_score = 0.05 + + + class RoiParam: + fp16 = General.fp16 + normalizer = NormalizeParam.normalizer + sample_bins = 7 + im2col = True + stride = (8, 16, 32, 64, 128) + conv_channel = 256 * 7 * 7 + scale = (4 * 2 ** (1.0 / 3.0),) + ratio = (1.0,) + + + class DatasetParam: + if is_train: + image_set = ("coco_train2014", "coco_valminusminival2014") + else: + image_set = ("coco_minival2014", ) + + backbone = Backbone(BackboneParam) + neck = Neck(NeckParam) + rpn_head = RpnHead(RpnParam) + roi_extractor = RoiExtractor(RoiParam) + bbox_head = BboxHead(BboxParam) + detector = Detector() + if is_train: + train_sym = detector.get_train_symbol(backbone, neck, rpn_head, roi_extractor, bbox_head) + rpn_test_sym = None + test_sym = None + else: + train_sym = None + rpn_test_sym = None + test_sym = detector.get_test_symbol(backbone, neck, rpn_head, roi_extractor, bbox_head) + + + class ModelParam: + train_symbol = train_sym + test_symbol = test_sym + rpn_test_symbol = rpn_test_sym + + from_scratch = False + random = True + memonger = False + memonger_until = "stage3_unit21_plus" + + class pretrain: + prefix = "pretrain_model/resnet-v1-50" + epoch = 0 + fixed_param = ["conv0", "stage1", "gamma", "beta"] + + + class OptimizeParam: + class optimizer: + type = "sgd" + lr = 0.005 / 8 * 
len(KvstoreParam.gpus) * KvstoreParam.batch_image + momentum = 0.9 + wd = 0.0001 + clip_gradient = None + + class schedule: + begin_epoch = 0 + end_epoch = 6 + lr_iter = [60000 * 16 // (len(KvstoreParam.gpus) * KvstoreParam.batch_image), + 80000 * 16 // (len(KvstoreParam.gpus) * KvstoreParam.batch_image)] + + class warmup: + type = "gradual" + lr = 0.005 / 8 * len(KvstoreParam.gpus) * KvstoreParam.batch_image / 3 + iter = 500 + + + class TestParam: + min_det_score = 0 # filter appended boxes + max_det_per_image = 100 + + process_roidb = lambda x: x + process_output = lambda x, y: x + + class model: + prefix = "experiments/{}/checkpoint".format(General.name) + epoch = OptimizeParam.schedule.end_epoch + + class nms: + type = "nms" + thr = 0.5 + + class coco: + annotation = "data/coco/annotations/instances_minival2014.json" + + # data processing + class NormParam: + mean = (122.7717, 115.9465, 102.9801) # RGB order + std = (1.0, 1.0, 1.0) + + + class ResizeParam: + short = 800 + long = 1333 + + + class PadParam: + short = 800 + long = 1333 + max_num_gt = 100 + + + class AnchorTarget2DParam: + def __init__(self): + self.generate = self._generate() + self.mean = (0.0, 0.0, 0.0, 0.0) + self.std = (1.0, 1.0, 1.0, 1.0) + self.class_agnostic = False + + class _generate: + def __init__(self): + self.short = (100, 50, 25, 13, 7) + self.long = (167, 84, 42, 21, 11) + self.stride = (8, 16, 32, 64, 128) + + scales = (4 * 2 ** (1.0 / 3.0),) + aspects = (1.0,) + + class assign: + allowed_border = 9999 + pos_thr = 0.4 + neg_thr = 0.3 + min_pos_thr = 0.0 + + class sample: + image_anchor = None + pos_fraction = None + + + class RenameParam: + mapping = dict(image="data") + + + from core.detection_input import ReadRoiRecord, Resize2DImageBbox, \ + ConvertImageFromHwcToChw, Flip2DImageBbox, Pad2DImageBbox, \ + RenameRecord + from models.retinanet.input import PyramidAnchorTarget2D, Norm2DImage + + if is_train: + transform = [ + ReadRoiRecord(None), + Norm2DImage(NormParam), + Resize2DImageBbox(ResizeParam), + Flip2DImageBbox(), + Pad2DImageBbox(PadParam), + ConvertImageFromHwcToChw(), + PyramidAnchorTarget2D(AnchorTarget2DParam()), + RenameRecord(RenameParam.mapping) + ] + data_name = ["data", "im_info", "gt_bbox"] + label_name = ["rpn_cls_label", "rpn_reg_target", "rpn_reg_weight"] + else: + transform = [ + ReadRoiRecord(None), + Norm2DImage(NormParam), + Resize2DImageBbox(ResizeParam), + ConvertImageFromHwcToChw(), + RenameRecord(RenameParam.mapping) + ] + data_name = ["data", "im_info", "im_id", "rec_id"] + label_name = [] + + from models.retinanet import metric + + rpn_acc_metric = metric.FGAccMetric( + "FGAcc", + ["cls_loss_output"], + ["rpn_cls_label"] + ) + + bbox_acc_metric = metric.FGAccMetric( + "BboxFGAcc", + ["align_cls_loss_output", "align_label_blockgrad_output"], + [] + ) + + metric_list = [rpn_acc_metric, bbox_acc_metric] + + return General, KvstoreParam, RpnParam, RoiParam, BboxParam, DatasetParam, \ + ModelParam, OptimizeParam, TestParam, \ + transform, data_name, label_name, metric_list diff --git a/config/aligndet_r50v1_fpn_ignorep3_fp16_1x.py b/config/aligndet_r50v1_fpn_ignorep3_fp16_1x.py new file mode 100644 index 0000000..6e39672 --- /dev/null +++ b/config/aligndet_r50v1_fpn_ignorep3_fp16_1x.py @@ -0,0 +1,294 @@ +from symbol.builder import FasterRcnn as Detector +from models.retinanet.builder import MSRAResNet50V1FPN as Backbone +from models.retinanet.builder import RetinaNetNeck as Neck +from models.aligndet.builder import AlignRetinaNetHead as RpnHead +from models.aligndet.builder 
import AlignRoiExtractor as RoiExtractor +from models.aligndet.builder import AlignHead as BboxHead +from mxnext.complicate import normalizer_factory + + +def get_config(is_train): + class General: + log_frequency = 10 + name = __name__.rsplit("/")[-1].rsplit(".")[-1] + batch_image = 2 if is_train else 1 + fp16 = True + + + class KvstoreParam: + kvstore = "local" + batch_image = General.batch_image + gpus = [0, 1, 2, 3, 4, 5, 6, 7] + fp16 = General.fp16 + + + class NormalizeParam: + normalizer = normalizer_factory(type="fixbn") + + + class BackboneParam: + fp16 = General.fp16 + normalizer = NormalizeParam.normalizer + + + class NeckParam: + fp16 = General.fp16 + normalizer = NormalizeParam.normalizer + + + class RpnParam: + fp16 = General.fp16 + normalizer = NormalizeParam.normalizer + batch_image = General.batch_image + num_class = 1 + 80 + loss_weight = 1.0 + + class anchor_generate: + scale = (4 * 2 ** (1.0 / 3.0),) + ratio = (1.0,) + stride = (8, 16, 32, 64, 128) + short = (100, 50, 25, 13, 7) + long = (167, 84, 42, 21, 11) + image_anchor = None + + class head: + conv_channel = 256 + mean = (0.0, 0.0, 0.0, 0.0) + std = (1.0, 1.0, 1.0, 1.0) + + class focal_loss: + alpha = 0.25 + gamma = 2.0 + + class proposal: + pre_nms_top_n = 1000 + min_bbox_side = 0 + min_det_score = 0.05 + + class subsample_proposal: + pass + + class bbox_target: + class_agnostic = False + mean = (0.0, 0.0, 0.0, 0.0) + std = (1.0, 1.0, 1.0, 1.0) + allowed_border = 9999 + pos_thr = 0.7 + neg_thr = 0.7 + min_pos_thr = 0.0 + + + class BboxParam: + fp16 = General.fp16 + normalizer = NormalizeParam.normalizer + num_class = 1 + 80 + loss_weight = 1.0 + + class anchor_generate: + scale = (4 * 2 ** (1.0 / 3.0),) + ratio = (1.0,) + stride = (8, 16, 32, 64, 128) + + class head: + merge_score = False + num_conv = 2 + use_1x1 = True + conv_channel = 1024 + mean = (0.0, 0.0, 0.0, 0.0) + std = (1.0, 1.0, 1.0, 1.0) + ignore_p3 = True + + class focal_loss: + alpha = 0.25 + gamma = 2.0 + + class proposal: + pre_nms_top_n = 1000 + min_bbox_side = 0 + min_det_score = 0.05 + + + class RoiParam: + fp16 = General.fp16 + normalizer = NormalizeParam.normalizer + sample_bins = 7 + im2col = True + stride = (8, 16, 32, 64, 128) + conv_channel = 256 * 7 * 7 + scale = (4 * 2 ** (1.0 / 3.0),) + ratio = (1.0,) + ignore_p3 = True + + + class DatasetParam: + if is_train: + image_set = ("coco_train2014", "coco_valminusminival2014") + else: + image_set = ("coco_minival2014", ) + + backbone = Backbone(BackboneParam) + neck = Neck(NeckParam) + rpn_head = RpnHead(RpnParam) + roi_extractor = RoiExtractor(RoiParam) + bbox_head = BboxHead(BboxParam) + detector = Detector() + if is_train: + train_sym = detector.get_train_symbol(backbone, neck, rpn_head, roi_extractor, bbox_head) + rpn_test_sym = None + test_sym = None + else: + train_sym = None + rpn_test_sym = None + test_sym = detector.get_test_symbol(backbone, neck, rpn_head, roi_extractor, bbox_head) + + + class ModelParam: + train_symbol = train_sym + test_symbol = test_sym + rpn_test_symbol = rpn_test_sym + + from_scratch = False + random = True + memonger = False + memonger_until = "stage3_unit21_plus" + + class pretrain: + prefix = "pretrain_model/resnet-v1-50" + epoch = 0 + fixed_param = ["conv0", "stage1", "gamma", "beta"] + + + class OptimizeParam: + class optimizer: + type = "sgd" + lr = 0.005 / 8 * len(KvstoreParam.gpus) * KvstoreParam.batch_image + momentum = 0.9 + wd = 0.0001 + clip_gradient = None + + class schedule: + begin_epoch = 0 + end_epoch = 6 + lr_iter = [60000 * 16 // 
(len(KvstoreParam.gpus) * KvstoreParam.batch_image), + 80000 * 16 // (len(KvstoreParam.gpus) * KvstoreParam.batch_image)] + + class warmup: + type = "gradual" + lr = 0.005 / 8 * len(KvstoreParam.gpus) * KvstoreParam.batch_image / 3 + iter = 500 + + + class TestParam: + min_det_score = 0 # filter appended boxes + max_det_per_image = 100 + + process_roidb = lambda x: x + process_output = lambda x, y: x + + class model: + prefix = "experiments/{}/checkpoint".format(General.name) + epoch = OptimizeParam.schedule.end_epoch + + class nms: + type = "nms" + thr = 0.5 + + class coco: + annotation = "data/coco/annotations/instances_minival2014.json" + + # data processing + class NormParam: + mean = (122.7717, 115.9465, 102.9801) # RGB order + std = (1.0, 1.0, 1.0) + + + class ResizeParam: + short = 800 + long = 1333 + + + class PadParam: + short = 800 + long = 1333 + max_num_gt = 100 + + + class AnchorTarget2DParam: + def __init__(self): + self.generate = self._generate() + self.mean = (0.0, 0.0, 0.0, 0.0) + self.std = (1.0, 1.0, 1.0, 1.0) + self.class_agnostic = False + + class _generate: + def __init__(self): + self.short = (100, 50, 25, 13, 7) + self.long = (167, 84, 42, 21, 11) + self.stride = (8, 16, 32, 64, 128) + + scales = (4 * 2 ** (1.0 / 3.0),) + aspects = (1.0,) + + class assign: + allowed_border = 9999 + pos_thr = 0.4 + neg_thr = 0.3 + min_pos_thr = 0.0 + + class sample: + image_anchor = None + pos_fraction = None + + + class RenameParam: + mapping = dict(image="data") + + + from core.detection_input import ReadRoiRecord, Resize2DImageBbox, \ + ConvertImageFromHwcToChw, Flip2DImageBbox, Pad2DImageBbox, \ + RenameRecord + from models.retinanet.input import PyramidAnchorTarget2D, Norm2DImage + + if is_train: + transform = [ + ReadRoiRecord(None), + Norm2DImage(NormParam), + Resize2DImageBbox(ResizeParam), + Flip2DImageBbox(), + Pad2DImageBbox(PadParam), + ConvertImageFromHwcToChw(), + PyramidAnchorTarget2D(AnchorTarget2DParam()), + RenameRecord(RenameParam.mapping) + ] + data_name = ["data", "im_info", "gt_bbox"] + label_name = ["rpn_cls_label", "rpn_reg_target", "rpn_reg_weight"] + else: + transform = [ + ReadRoiRecord(None), + Norm2DImage(NormParam), + Resize2DImageBbox(ResizeParam), + ConvertImageFromHwcToChw(), + RenameRecord(RenameParam.mapping) + ] + data_name = ["data", "im_info", "im_id", "rec_id"] + label_name = [] + + from models.retinanet import metric + + rpn_acc_metric = metric.FGAccMetric( + "FGAcc", + ["cls_loss_output"], + ["rpn_cls_label"] + ) + + bbox_acc_metric = metric.FGAccMetric( + "BboxFGAcc", + ["align_cls_loss_output", "align_label_blockgrad_output"], + [] + ) + + metric_list = [rpn_acc_metric, bbox_acc_metric] + + return General, KvstoreParam, RpnParam, RoiParam, BboxParam, DatasetParam, \ + ModelParam, OptimizeParam, TestParam, \ + transform, data_name, label_name, metric_list diff --git a/models/aligndet/__init__.py b/models/aligndet/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/models/aligndet/builder.py b/models/aligndet/builder.py new file mode 100644 index 0000000..62c3f72 --- /dev/null +++ b/models/aligndet/builder.py @@ -0,0 +1,939 @@ +from __future__ import division +from __future__ import print_function + +import mxnet as mx +import mxnext as X +import math + +from symbol.builder import BboxHead, RoiExtractor +from models.retinanet.builder import RetinaNetHead + + +class AlignRetinaNetHead(RetinaNetHead): + def __init__(self, pRpn): + super(AlignRetinaNetHead, self).__init__(pRpn) + self._proposal = None + + def 
get_all_proposal(self, conv_feat, im_info): + if self._proposal is not None: + return self._proposal + + p = self.p + batch_image = p.batch_image + stride = p.anchor_generate.stride + if not isinstance(stride, tuple): + stride = (stride, ) + ratios = p.anchor_generate.ratio + scales = p.anchor_generate.scale + anchor_target_mean = p.head.mean + anchor_target_std = p.head.std + num_base_anchor = len(ratios) * len(scales) + pick_anchor = p.pick_anchor or False + nms = p.nms or False + + cls_logit_dict, bbox_delta_dict = self.get_output(conv_feat) + + proposal_dict, proposal_score_dict = dict(), dict() + + for s in stride: + """ + cls_prob: (N, A * C, H, W) + bbox_delta: (N, A * 4, H, W) + """ + cls_prob = X.sigmoid(data=cls_logit_dict["stride%s" % s]) + bbox_delta = bbox_delta_dict["stride%s" % s] + + # (N, A * 4, H, W) -> (N, A, 4, H * W) + bbox_delta = X.reshape( + data=bbox_delta, + shape=(0, num_base_anchor, 4, -1), + name="bbox_delta_reshape_stride%s" % s + ) + # (N, A, 4, H * W) -> (N, H * W, A, 4) + bbox_delta = X.transpose( + data=bbox_delta, + axes=(0, 3, 1, 2), + name="bbox_delta_reshape_transpose_stride%s" % s + ) + # (N, H * W, A, 4) -> (N, H * W * A, 4) + bbox_delta = X.reshape( + data=bbox_delta, + shape=(0, -1, 4), + name="bbox_delta_reshape_transpose_reshape_stride%s" % s + ) + + anchor = mx.sym.contrib.GenAnchor( + cls_prob=bbox_delta_dict["stride%s" % s], + feature_stride=s, + scales=tuple(scales), + ratios=tuple(ratios), + name="anchors_stride%s" % s + ) + + # decode anchor + bbox_delta_list = mx.sym.split(bbox_delta, num_outputs=batch_image, axis=0, squeeze_axis=False) + im_info_list = mx.sym.split(im_info, num_outputs=batch_image, axis=0, squeeze_axis=False) + bbox_xyxy_list = list() + anchor_expand_dims = anchor.expand_dims(axis=0) + for bbox_delta_i, im_info_i in zip(bbox_delta_list, im_info_list): + pad_zero = mx.sym.zeros_like(bbox_delta_i) + bbox_delta_i = mx.sym.concat(pad_zero, bbox_delta_i, dim=-1) + bbox_xyxy_i = X.decode_bbox( + rois=anchor_expand_dims, + bbox_pred=bbox_delta_i, + im_info=im_info_i, + bbox_mean=anchor_target_mean, + bbox_std=anchor_target_std, + class_agnostic=True + ) + bbox_xyxy_list.append(bbox_xyxy_i) + bbox_xyxy = mx.sym.concat(*bbox_xyxy_list, dim=0, name="proposal_stride%s_retina" % s) + + proposal_dict["stride%s" % s] = bbox_xyxy + proposal_score_dict["stride%s" % s] = cls_prob + + if pick_anchor: + for s in stride: + cls_score = proposal_score_dict["stride%s"%s] + bbox_xyxy = proposal_dict["stride%s"%s] + + # (N, A * C, H, W) -> (N, H * W, A), C = 1 + cls_score = cls_score.transpose((0, 2, 3, 1)) + cls_score = cls_score.reshape((0, -3, 0)) + # (N, H * W * A, 4) -> (N, H * W, A, 4) + bbox_xyxy = bbox_xyxy.reshape((0, -1, num_base_anchor, 4)) + + argmax_cls_score = cls_score.argmax(axis=2) + argmax_cls_score_stack = mx.sym.stack(*([argmax_cls_score] * 4), axis=2) + + sample_cls_score = mx.sym.pick(cls_score, argmax_cls_score, axis=2) + sample_bbox_xyxy = mx.sym.pick(bbox_xyxy, argmax_cls_score_stack, axis=2) + + # (N, H * W) -> (N, A * C, H * W), A = C = 1 + sample_cls_score = sample_cls_score.reshape((0, 1, -1)) + + proposal_score_dict["stride%s"%s] = sample_cls_score + proposal_dict["stride%s"%s] = sample_bbox_xyxy + elif nms: + nms_thr = p.nms_thr + + for s in stride: + cls_score = proposal_score_dict["stride%s"%s] + bbox_xyxy = proposal_dict["stride%s"%s] + + # (N, A * C, H, W) -> (N, H * W, A, C) + cls_score = cls_score.reshape((0, 0, -1)) + cls_score = cls_score.transpose((0, 2, 1)) + cls_score = cls_score.reshape((0, 0, 
num_base_anchor, -1)) + # (N, H * W, A, C) -> (N, H * W, A, 1) + cls_score = mx.sym.sum(cls_score, axis=3, keepdims=True) + proposal_score_dict["stride%s"%s] = cls_score + # (N, H * W * A, 4) -> (N, H * W, A, 4) + bbox_xyxy = bbox_xyxy.reshape((0, -1, num_base_anchor, 4)) + + (cls_score, bbox_xyxy) = mx.sym.contrib.InplaceNMS( + cls_prob=cls_score, + bbox_pred=bbox_xyxy, + nms_thr=nms_thr, + score_reset_value=0, + bbox_reset_value=999999, # larger than allowed_border + name="inplace_nms" + ) + + # (N, H * W, A, C) -> (N, A * C, H, W), C = 1 + cls_score = cls_score.reshape((0, 0, -3)) + cls_score = cls_score.transpose((0, 2, 1)) + cls_score = cls_score.reshape_like(proposal_score_dict["stride%s"%s]) + # (N, H * W, A, 4) -> (N, H * W * A, 4) + bbox_xyxy = bbox_xyxy.reshape_like(proposal_dict["stride%s"%s]) + + proposal_dict["stride%s"%s] = bbox_xyxy + proposal_score_dict["stride%s"%s] = cls_score + + proposal = (proposal_dict, proposal_score_dict) + self._proposal = proposal + + return proposal + + def get_sampled_proposal(self, conv_feat, gt_bbox, im_info): + p = self.p + + mean = p.bbox_target.mean + std = p.bbox_target.std + class_agnostic = p.bbox_target.class_agnostic + short = p.anchor_generate.short + long = p.anchor_generate.long + stride = p.anchor_generate.stride + if not isinstance(stride, tuple): + stride = (stride, ) + num_anchors = len(p.anchor_generate.scale) * len(p.anchor_generate.ratio) + allowed_border = p.bbox_target.allowed_border + pos_thr = p.bbox_target.pos_thr + neg_thr = p.bbox_target.neg_thr + min_pos_thr = p.bbox_target.min_pos_thr + pick_anchor = p.pick_anchor or False + if pick_anchor: + num_anchors = 1 + + (anchor_dict, anchor_score_dict) = self.get_all_proposal(conv_feat, im_info) + + # custom op to encode new target + from models.aligndet import encode_anchor # noqa: F401 + + (label, bbox_target, bbox_weight) = mx.sym.Custom( + op_type="encode_anchor", + gt_boxes=gt_bbox, + im_info=im_info, + short=short, + long=long, + stride=stride, + num_anchors=num_anchors, + class_agnostic=class_agnostic, + allowed_border=allowed_border, + pos_thr=pos_thr, + neg_thr=neg_thr, + min_pos_thr=min_pos_thr, + mean=mean, + std=std, + name="encode_anchor", + **anchor_dict + ) + + return anchor_dict, label, bbox_target, bbox_weight + + def get_loss(self, conv_feat, cls_label, bbox_target, bbox_weight): + p = self.p + stride = p.anchor_generate.stride + if not isinstance(stride, tuple): + stride = (stride, ) + num_class = p.num_class + loss_weight = p.loss_weight + num_base_anchor = len(p.anchor_generate.ratio) * len(p.anchor_generate.scale) + scale_loss_shift = 128.0 if p.fp16 else 1.0 + reg_only = p.reg_only or False + + cls_logit_dict, bbox_delta_dict = self.get_output(conv_feat) + cls_logit_reshape_list = [] + bbox_delta_reshape_list = [] + + # reshape logit and delta + for i, s in enumerate(stride): + # (N, A * C, H, W) -> (N, A, C, H * W) + cls_logit = X.reshape( + data=cls_logit_dict["stride%s" % s], + shape=(0, num_base_anchor, num_class-1, -1), + name="cls_stride%s_reshape" % s + ) + # (N, A, C, H * W) -> (N, A, H * W, C) + cls_logit = X.transpose( + data=cls_logit, + axes=(0, 1, 3, 2), + name="cls_stride%s_transpose" % s + ) + # (N, A, H * W, C) -> (N, A * H * W, C) + cls_logit = X.reshape( + data=cls_logit, + shape=(0, -3, 0), + name="cls_stride%s_transpose_reshape" % s + ) + + # (N, A * 4, H, W) -> (N, A * 4, H * W) + bbox_delta = X.reshape( + data=bbox_delta_dict["stride%s" % s], + shape=(0, 0, -1), + name="bbox_stride%s_reshape" % s + ) + + cls_logit_reshape_list.append(cls_logit)
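+ # the matching per-level bbox deltas are collected as well; both lists are concatenated across FPN levels right after this loop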
+ bbox_delta_reshape_list.append(bbox_delta) + + cls_logit_concat = X.concat(cls_logit_reshape_list, axis=1, name="bbox_logit_concat") + bbox_delta_concat = X.concat(bbox_delta_reshape_list, axis=2, name="bbox_delta_concat") + + # classification loss + cls_loss = X.focal_loss( + data=cls_logit_concat, + label=cls_label, + normalization='valid', + alpha=p.focal_loss.alpha, + gamma=p.focal_loss.gamma, + grad_scale=0.0 if reg_only else 1.0 * loss_weight * scale_loss_shift, + workspace=1024, + name="cls_loss" + ) + + scalar = 0.11 + # regression loss + bbox_norm = X.bbox_norm( + data=bbox_delta_concat - bbox_target, + label=cls_label, + name="bbox_norm" + ) + bbox_loss = bbox_weight * X.smooth_l1( + data=bbox_norm, + scalar=math.sqrt(1/scalar), + name="bbox_loss" + ) + reg_loss = X.make_loss( + data=bbox_loss, + grad_scale=1.0 * loss_weight * scale_loss_shift, + name="reg_loss" + ) + + return cls_loss, reg_loss + + +class AlignHead(BboxHead): + def __init__(self, pBbox): + super(AlignHead, self).__init__(pBbox) + + p = self.p + num_conv = p.head.num_conv + init = p.head.init or X.gauss(0.01) + stage = p.stage or "" + ignore_p3 = p.head.ignore_p3 or False + + self.align_conv_weight = [X.var("align_conv_%d_weight%s" % (i + 1, stage), init=init) for i in range(num_conv)] + self.align_conv_bias = [X.var("align_conv_%d_bias%s" % (i + 1, stage), init=X.zero_init()) for i in range(num_conv)] + + if ignore_p3: + self.align_conv_p3_weight = [X.var("align_conv_p3_%d_weight%s" % (i + 1, stage), init=init) for i in range(num_conv)] + self.align_conv_p3_bias = [X.var("align_conv_p3_%d_bias%s" % (i + 1, stage), init=X.zero_init()) for i in range(num_conv)] + + self._head_feat_dict = None + self._cls_logit_dict = None + self._bbox_delta_dict = None + + self.stage = stage + + def _get_bbox_head_logit(self, conv_feat, conv_channel, ignore_p3=False): + p = self.p + num_conv = p.head.num_conv + use_1x1 = p.head.use_1x1 or False + + for i in range(num_conv): + conv_feat = X.conv( + data=conv_feat, + kernel=1 if use_1x1 else 3, + filter=conv_channel, + weight=self.align_conv_p3_weight[i] if ignore_p3 else self.align_conv_weight[i], + bias=self.align_conv_p3_bias[i] if ignore_p3 else self.align_conv_bias[i], + no_bias=False, + name="align_conv_%d%s" % (i + 1, self.stage) + ) + conv_feat = X.relu(conv_feat) + + if p.fp16: + conv_feat = X.to_fp32(conv_feat, name="align_conv_fp32") + + return conv_feat + + def get_output(self, conv_feat): + if self._cls_logit_dict is not None and self._bbox_delta_dict is not None: + return self._cls_logit_dict, self._bbox_delta_dict + + p = self.p + stride = p.anchor_generate.stride + if not isinstance(stride, tuple): + stride = (stride, ) + conv_channel = p.head.conv_channel + num_base_anchor = len(p.anchor_generate.ratio) * len(p.anchor_generate.scale) + num_class = p.num_class + separate_predictor = p.head.separate_predictor or False + batch_image = p.batch_image + ignore_p3 = p.head.ignore_p3 or False + + prior_prob = 0.01 + pi = -math.log((1 - prior_prob) / prior_prob) + + stage = self.stage + align_conv_cls_weight = X.var("align_conv_cls_weight%s" % stage, init=X.gauss(std=0.01)) + align_conv_cls_bias = X.var("align_conv_cls_bias%s" % stage, init=X.constant(pi)) + align_conv_bbox_weight = X.var("align_conv_bbox_weight%s" % stage, init=X.gauss(std=0.01)) + align_conv_bbox_bias = X.var("align_conv_bbox_bias%s" % stage, init=X.zero_init()) + + head_feat_dict = {} + cls_logit_dict = {} + bbox_delta_dict = {} + + for s in stride: + align_conv_relu = self._get_bbox_head_logit( 
conv_feat=conv_feat["stride%s" % s], + conv_channel=conv_channel, + ignore_p3=ignore_p3 and s == 8 + ) + + head_feat_dict["stride%s" % s] = align_conv_relu + + if separate_predictor: + align_conv_relu = X.reshape(align_conv_relu, shape=(batch_image, -1, 0, 0)) + + cls_logit = X.conv( + align_conv_relu, + num_group=num_base_anchor, + filter=num_base_anchor * (num_class - 1), + no_bias=False, + weight=align_conv_cls_weight, + bias=align_conv_cls_bias, + name="align_cls_score_stride%s" % s + ) + + bbox_delta = X.conv( + align_conv_relu, + num_group=num_base_anchor, + filter=num_base_anchor * 4, + no_bias=False, + weight=align_conv_bbox_weight, + bias=align_conv_bbox_bias, + name="align_bbox_pred_stride%s" % s + ) + + cls_logit_dict["stride%s" % s] = cls_logit + bbox_delta_dict["stride%s" % s] = bbox_delta + else: + cls_logit = X.conv( + align_conv_relu, + filter=num_class - 1, + no_bias=False, + weight=align_conv_cls_weight, + bias=align_conv_cls_bias, + name="align_cls_score_stride%s" % s + ) + + bbox_delta = X.conv( + align_conv_relu, + filter=4, + no_bias=False, + weight=align_conv_bbox_weight, + bias=align_conv_bbox_bias, + name="align_bbox_pred_stride%s" % s + ) + + cls_logit_dict["stride%s" % s] = cls_logit.reshape(shape=(-1, num_base_anchor * (num_class - 1), 0, 0)) + bbox_delta_dict["stride%s" % s] = bbox_delta.reshape(shape=(-1, num_base_anchor * 4, 0, 0)) + + self._head_feat_dict = head_feat_dict + self._cls_logit_dict = cls_logit_dict + self._bbox_delta_dict = bbox_delta_dict + + return cls_logit_dict, bbox_delta_dict + + def get_prediction(self, conv_feat, im_info, proposal): + p = self.p + merge_score = p.head.merge_score or False + use_prev_bbox = p.head.use_prev_bbox or False + use_prev_score = p.head.use_prev_score or False + assert not (merge_score and use_prev_score), "merge_score confilicts with use_prev_score" + stride = p.anchor_generate.stride + if not isinstance(stride, tuple): + stride = (stride, ) + ratios = p.anchor_generate.ratio + scales = p.anchor_generate.scale + pre_nms_top_n = p.proposal.pre_nms_top_n + min_bbox_side = p.proposal.min_bbox_side + min_det_score = p.proposal.min_det_score + anchor_target_mean = p.head.mean + anchor_target_std = p.head.std + num_anchors = len(ratios) * len(scales) + + cls_logit_dict, bbox_delta_dict = self.get_output(conv_feat) + + bbox_xyxy_list = [] + cls_score_list = [] + + proposal, proposal_score = proposal + + for i, s in enumerate(stride): + cls_prob = X.sigmoid(data=cls_logit_dict["stride%s" % s]) + bbox_delta = bbox_delta_dict["stride%s" % s] + anchors = proposal["stride%s" % s] + anchor_scores = proposal_score["stride%s" % s] + + if merge_score: + cls_prob = cls_prob + anchor_scores + + if use_prev_bbox: + bbox_delta = mx.sym.zeros_like(bbox_delta) + + if use_prev_score: + cls_prob = anchor_scores + + thresh_level = 0 if s == max(stride) else min_det_score + bbox_xyxy, cls_score = mx.sym.contrib.GenProposalRetina( + cls_prob=cls_prob, + bbox_pred=bbox_delta, + im_info=im_info, + anchors=anchors, + feature_stride=s, + anchor_mean=anchor_target_mean, + anchor_std=anchor_target_std, + num_anchors=num_anchors, + rpn_pre_nms_top_n=pre_nms_top_n, + rpn_min_size=min_bbox_side, + thresh=thresh_level, + batch_wise_anchor=True, + workspace=512, + name="proposal_pre_nms_stride%s" % s + ) + + bbox_xyxy_list.append(bbox_xyxy) + cls_score_list.append(cls_score) + + bbox_xyxy = X.concat(bbox_xyxy_list, axis=1, name="align_bbox_xyxy_concat") + cls_score = X.concat(cls_score_list, axis=1, name="align_cls_score_concat") + + return 
cls_score, bbox_xyxy + + def get_all_proposal(self, conv_feat, im_info, anchor_dict): + """ + anchors are the bboxes and scores of the previous stage, not of this stage. + """ + + p = self.p + batch_image = p.batch_image + stride = p.anchor_generate.stride + if not isinstance(stride, tuple): + stride = (stride, ) + anchor_target_mean = p.head.mean + anchor_target_std = p.head.std + ratios = p.anchor_generate.ratio + scales = p.anchor_generate.scale + num_base_anchor = len(ratios) * len(scales) + + cls_logit_dict, bbox_delta_dict = self.get_output(conv_feat) + + proposal_dict, proposal_score_dict = dict(), dict() + + for i, s in enumerate(stride): + cls_prob = X.sigmoid(data=cls_logit_dict["stride%s" % s]) + bbox_delta = bbox_delta_dict["stride%s" % s] + anchor = anchor_dict["stride%s" % s] + + # (N, A * 4, H, W) -> (N, A, 4, H * W) + bbox_delta = X.reshape( + data=bbox_delta, + shape=(0, num_base_anchor, 4, -1), + name="bbox_delta_reshape_stride%s" % s + ) + # (N, A, 4, H * W) -> (N, H * W, A, 4) + bbox_delta = X.transpose( + data=bbox_delta, + axes=(0, 3, 1, 2), + name="bbox_delta_reshape_transpose_stride%s" % s + ) + # (N, H * W, A, 4) -> (N, H * W * A, 4) + bbox_delta = X.reshape( + data=bbox_delta, + shape=(0, -1, 4), + name="bbox_delta_reshape_transpose_reshape_stride%s" % s + ) + + # decode anchor + bbox_delta_list = mx.sym.split(bbox_delta, num_outputs=batch_image, axis=0, squeeze_axis=False) + anchor_list = mx.sym.split(anchor, num_outputs=batch_image, axis=0, squeeze_axis=False) + im_info_list = mx.sym.split(im_info, num_outputs=batch_image, axis=0, squeeze_axis=False) + bbox_xyxy_list = list() + for bbox_delta_i, anchor_i, im_info_i in zip(bbox_delta_list, anchor_list, im_info_list): + pad_zero = mx.sym.zeros_like(bbox_delta_i) + bbox_delta_i = mx.sym.concat(pad_zero, bbox_delta_i, dim=-1) + bbox_xyxy_i = X.decode_bbox( + rois=anchor_i, + bbox_pred=bbox_delta_i, + im_info=im_info_i, + bbox_mean=anchor_target_mean, + bbox_std=anchor_target_std, + class_agnostic=True + ) + bbox_xyxy_list.append(bbox_xyxy_i) + bbox_xyxy = mx.sym.concat(*bbox_xyxy_list, dim=0, name="proposal_stride%s_%s" % (s, self.stage)) + + proposal_dict["stride%s" % s] = bbox_xyxy + proposal_score_dict["stride%s" % s] = cls_prob + + return proposal_dict, proposal_score_dict + + def get_proposal_and_label(self, conv_feat, gt_bbox, im_info, prev_anchor): + """ + Get proposal of this head with the proposal from the previous head. + Use proposal of this head to encode the training target of the next head. 
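+ Returns the refined anchors of this head together with the (label, bbox_target, bbox_weight) encoded for training the next head.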
+ """ + p = self.p + + mean = p.bbox_target.mean + std = p.bbox_target.std + class_agnostic = p.bbox_target.class_agnostic + short = p.anchor_generate.short + long = p.anchor_generate.long + stride = p.anchor_generate.stride + if not isinstance(stride, tuple): + stride = (stride, ) + num_anchors = len(p.anchor_generate.scale) * len(p.anchor_generate.ratio) + allowed_border = p.bbox_target.allowed_border + pos_thr = p.bbox_target.pos_thr + neg_thr = p.bbox_target.neg_thr + min_pos_thr = p.bbox_target.min_pos_thr + + anchor_dict, anchor_score_dict = self.get_all_proposal(conv_feat, im_info, prev_anchor) + + # custom op to encode new target + from models.aligndet import encode_anchor # noqa: F401 + + (label, bbox_target, bbox_weight) = mx.sym.Custom( + op_type="encode_anchor", + gt_boxes=gt_bbox, + im_info=im_info, + short=short, + long=long, + stride=stride, + num_anchors=num_anchors, + class_agnostic=class_agnostic, + allowed_border=allowed_border, + pos_thr=pos_thr, + neg_thr=neg_thr, + min_pos_thr=min_pos_thr, + mean=mean, + std=std, + name="encode_anchor", + **anchor_dict + ) + + return anchor_dict, label, bbox_target, bbox_weight + + def get_loss(self, conv_feat, cls_label, bbox_target, bbox_weight): + p = self.p + stride = p.anchor_generate.stride + if not isinstance(stride, tuple): + stride = (stride, ) + num_class = p.num_class + loss_weight = p.loss_weight + num_base_anchor = len(p.anchor_generate.ratio) * len(p.anchor_generate.scale) + stage = self.stage + + cls_logit_dict, bbox_delta_dict = self.get_output(conv_feat) + cls_logit_reshape_list = [] + bbox_delta_reshape_list = [] + + scale_loss_shift = 128.0 if p.fp16 else 1.0 + + # reshape logit and delta + for i, s in enumerate(stride): + # (N, A * C, H, W) -> (N, A, C, H * W) + cls_logit = X.reshape( + data=cls_logit_dict["stride%s" % s], + shape=(0, num_base_anchor, num_class - 1, -1), + name="align_cls_stride%s_reshape" % s + ) + # (N, A, C, H * W) -> (N, A, H * W, C) + cls_logit = X.transpose( + data=cls_logit, + axes=(0, 1, 3, 2), + name="align_cls_stride%s_transpose" % s + ) + # (N, A, H * W, C) -> (N, A * H * W, C) + cls_logit = X.reshape( + data=cls_logit, + shape=(0, -3, 0), + name="align_cls_stride%s_transpose_reshape" % s + ) + + # (N, A * 4, H, W) -> (N, A * 4, H * W) + bbox_delta = X.reshape( + data=bbox_delta_dict["stride%s" % s], + shape=(0, 0, -1), + name="align_bbox_stride%s_reshape" % s + ) + + cls_logit_reshape_list.append(cls_logit) + bbox_delta_reshape_list.append(bbox_delta) + + cls_logit_concat = X.concat( + cls_logit_reshape_list, + axis=1, + name="align_bbox_logit_concat" + ) + bbox_delta_concat = X.concat( + bbox_delta_reshape_list, + axis=2, + name="align_bbox_delta_concat" + ) + + # classification loss + cls_loss = X.focal_loss( + data=cls_logit_concat, + label=cls_label, + normalization='valid', + alpha=p.focal_loss.alpha, + gamma=p.focal_loss.gamma, + grad_scale=1.0 * loss_weight * scale_loss_shift, + workspace=1024, + name="align_cls_loss%s" % stage + ) + + scalar = 0.11 + # regression loss + bbox_norm = X.bbox_norm( + data=bbox_delta_concat - bbox_target, + label=cls_label, + name="align_bbox_norm" + ) + bbox_loss = bbox_weight * X.smooth_l1( + data=bbox_norm, + scalar=math.sqrt(1 / scalar), + name="align_bbox_loss" + ) + reg_loss = X.make_loss( + data=bbox_loss, + grad_scale=1.0 * loss_weight * scale_loss_shift, + name="align_reg_loss%s" % stage + ) + + cls_label = X.block_grad(cls_label, name="align_label_blockgrad%s" % stage) + + return cls_loss, reg_loss, cls_label + + +class 
AlignRoiExtractor(RoiExtractor): + def __init__(self, pRoi): + super(AlignRoiExtractor, self).__init__(pRoi) + + def get_roi_feature(self, feat_dict, anchor_dict): + p = self.p + stride = p.stride + if not isinstance(stride, tuple): + stride = (stride, ) + conv_channel = p.conv_channel + ratios = p.ratio + scales = p.scale + num_anchors = len(ratios) * len(scales) + sample_bins = p.sample_bins + stage = p.stage or "" + im2col = p.im2col or False + conv3d = p.conv3d or False + roialign = p.roialign or False + ignore_p3 = p.ignore_p3 or False + gauss_init = p.gauss_init or False + guided_anchor = p.guided_anchor or False + learned_offset = p.learned_offset or False + assert not (guided_anchor and learned_offset) + + if p.fp16: + for s in stride: + feat_dict["stride%s" % s] = X.to_fp32( + feat_dict["stride%s" % s], + name="feat_stride%s_to_fp32%s" % (s, stage) + ) + + anchor_feat_dict = {} + + for s in stride: + if ignore_p3 and s == 8: + old_sample_bins = sample_bins + old_conv_channel = conv_channel + sample_bins = 3 + conv_channel = old_conv_channel // (old_sample_bins ** 2) * (sample_bins ** 2) + + if guided_anchor: + # (N, H * W * A, 4) -> (N, A * K * K * 2, H, W) + (x1y1, x2y2) = mx.sym.split(anchor_dict["stride%s" % s], num_outputs=2, axis=-1) + hw = x2y2 - x1y1 # (N, H * W * A, 2) + hw = X.reshape(hw, [0, -1, 2 * num_anchors]) # (N, H * W, A * 2) + hw = X.transpose(hw, [0, 2, 1]) # (N, A * 2, H * W) + hw = mx.sym.reshape_like(hw, feat_dict["stride%s" % s], lhs_begin=2, lhs_end=None, rhs_begin=2, rhs_end=None) # (N, A * 2, H, W) + # normalize input hw + hw = mx.sym.BatchNorm(hw, fix_gamma=False, use_global_stats=False, name="guided_anchor_bn_stride%s" % s) + offset = X.conv(hw, name="guided_anchor_offset_stride%s" % s, filter=2 * num_anchors * sample_bins ** 2, init=X.gauss(0.01)) + elif learned_offset: + offset = X.conv(feat_dict["stride%s" % s], name="learned_offset_stride%s" % s, filter=2 * num_anchors * sample_bins ** 2, init=X.gauss(0.01)) + else: + # (N, H * W * A, 4) -> (N, A * K * K * 2, H, W) + offset = mx.sym.contrib.GetAnchorOffset( + data=feat_dict["stride%s" % s], + anchor=anchor_dict["stride%s" % s], + kernel=(sample_bins, sample_bins), + stride=s, + name="get_anchor_offset_stride%s%s" % (s, stage) + ) + + anchor_feat_list = [] + for anchor_idx in range(num_anchors): + offset_i = mx.sym.slice_axis( + offset, + begin=anchor_idx * (sample_bins * sample_bins * 2), + end=(anchor_idx + 1) * (sample_bins * sample_bins * 2), + axis=1, + name="anchor_offset_stride%s_slice%s%s" % (s, anchor_idx, stage) + ) + if im2col: + anchor_feat_i = mx.sym.contrib.DeformableConvolutionIm2Col( + data=feat_dict["stride%s" % s], + offset=offset_i, + kernel=(sample_bins, sample_bins), + pad=(sample_bins // 2, sample_bins // 2), + num_filter=conv_channel, + name="deform_im2col_stride%s_slice%s%s" % (s, anchor_idx, stage) + ) + + else: + if gauss_init: + w = mx.sym.var("deform_conv_stride%s_slice%s%s_weight" % (s, anchor_idx, stage), init=X.gauss(0.01)) + else: + w = None + anchor_feat_i = mx.sym.contrib.DeformableConvolution( + data=feat_dict["stride%s" % s], + offset=offset_i, + weight=w, + kernel=(sample_bins, sample_bins), + pad=(sample_bins // 2, sample_bins // 2), + num_filter=conv_channel, + num_deformable_group=1, + no_bias=False, + name="deform_conv_stride%s_slice%s%s" % (s, anchor_idx, stage) + ) + anchor_feat_list.append(anchor_feat_i) + anchor_feat = X.concat( + anchor_feat_list, + axis=1, + name="anchor_feat_concat_stride%s%s" % (s, stage) + ) + + anchor_feat = X.reshape(anchor_feat, 
shape=(-1, conv_channel, 0, 0)) + + if roialign: + # (N, H * W * A, 4) -> (N, H * W * A, C, bin, bin) + feat = feat_dict["stride%s" % s] + anchor = anchor_dict["stride%s" % s] + anchor_feat = X.roi_align(feat, anchor, out_size=sample_bins, stride=s, name="roialign_stride%s" % s) + anchor_feat = X.reshape(anchor_feat, [0, 0, -1]) + anchor_feat = X.transpose(anchor_feat, [0, 2, 1]) + anchor_feat = mx.sym.reshape_like(anchor_feat, feat, lhs_begin=2, lhs_end=None, rhs_begin=2, rhs_end=None) + + anchor_feat_dict["stride%s" % s] = anchor_feat + + if ignore_p3 and s == 8: + sample_bins = old_sample_bins + conv_channel = old_conv_channel + + if p.fp16: + for s in stride: + anchor_feat_dict["stride%s" % s] = X.to_fp16( + anchor_feat_dict["stride%s" % s], + name="anchor_feat_stride%s_to_fp16%s" % (s, stage) + ) + + return anchor_feat_dict + + def get_roi_feature_test(self, feat_dict, anchor_dict): + return self.get_roi_feature(feat_dict, anchor_dict) + + +class FlatRoiExtractor(RoiExtractor): + def __init__(self, pRoi): + super(FlatRoiExtractor, self).__init__(pRoi) + + def get_roi_feature(self, feat_dict, anchor_dict): + return feat_dict + + def get_roi_feature_test(self, feat_dict, anchor_dict): + return self.get_roi_feature(feat_dict, anchor_dict) + + +class CascadeRcnn(object): + def __init__(self): + pass + + @staticmethod + def get_train_symbol(backbone, neck, rpn_head, roi_extractor: AlignRoiExtractor, roi_extractor_2nd: AlignRoiExtractor, bbox_head: AlignHead, bbox_head_2nd: AlignHead, share_feat=False): + gt_bbox = X.var("gt_bbox") + im_info = X.var("im_info") + rpn_cls_label = X.var("rpn_cls_label") + rpn_reg_target = X.var("rpn_reg_target") + rpn_reg_weight = X.var("rpn_reg_weight") + + rpn_feat = backbone.get_rpn_feature() + rcnn_feat = backbone.get_rcnn_feature() + rpn_feat = neck.get_rpn_feature(rpn_feat) + rcnn_feat = neck.get_rcnn_feature(rcnn_feat) + + rpn_loss = rpn_head.get_loss(rpn_feat, rpn_cls_label, rpn_reg_target, rpn_reg_weight) + + # stage1 + proposal, bbox_cls, bbox_target, bbox_weight = \ + rpn_head.get_sampled_proposal( + rpn_feat, + gt_bbox, + im_info + ) + roi_feat = roi_extractor.get_roi_feature(rcnn_feat, proposal) + bbox_loss = bbox_head.get_loss( + roi_feat, + bbox_cls, + bbox_target, + bbox_weight + ) + + # stage2 + # though call get_sampled_proposal, bbox_head does not sample rois + proposal_2nd, bbox_cls_2nd, bbox_target_2nd, bbox_weight_2nd = \ + bbox_head.get_proposal_and_label( + roi_feat, + gt_bbox, + im_info, + proposal, + ) + if share_feat: + feat = rcnn_feat + else: + feat = bbox_head._head_feat_dict + roi_feat_2nd = roi_extractor_2nd.get_roi_feature(feat, proposal_2nd) + bbox_loss_2nd = bbox_head_2nd.get_loss( + roi_feat_2nd, + bbox_cls_2nd, + bbox_target_2nd, + bbox_weight_2nd + ) + + return X.group(rpn_loss + bbox_loss + bbox_loss_2nd) + + @staticmethod + def get_test_symbol(backbone, neck, rpn_head: AlignRetinaNetHead, roi_extractor: AlignRoiExtractor, roi_extractor_2nd: AlignRoiExtractor, bbox_head: AlignHead, bbox_head_2nd: AlignHead, stage=3, share_feat=False): + im_info = X.var("im_info") + im_id = X.var("im_id") + rec_id = X.var("rec_id") + + rpn_feat = backbone.get_rpn_feature() + rcnn_feat = backbone.get_rcnn_feature() + rpn_feat = neck.get_rpn_feature(rpn_feat) + rcnn_feat = neck.get_rcnn_feature(rcnn_feat) + + if stage == 1: + cls_score, bbox_xyxy = rpn_head.get_prediction(rpn_feat, im_info) + elif stage == 2: + proposal, proposal_score = rpn_head.get_all_proposal(rpn_feat, im_info) + roi_feat = roi_extractor.get_roi_feature(rcnn_feat, 
proposal) + cls_score, bbox_xyxy = bbox_head.get_prediction( + roi_feat, + im_info, + (proposal, proposal_score) + ) + elif stage == 3: + proposal, proposal_score = rpn_head.get_all_proposal(rpn_feat, im_info) + roi_feat = roi_extractor.get_roi_feature(rcnn_feat, proposal) + proposal_2nd, proposal_score_2nd = bbox_head.get_all_proposal( + roi_feat, + im_info, + proposal + ) + if share_feat: + feat = rcnn_feat + else: + feat = bbox_head._head_feat_dict + roi_feat_2nd = roi_extractor_2nd.get_roi_feature(feat, proposal_2nd) + cls_score, bbox_xyxy = bbox_head_2nd.get_prediction( + roi_feat_2nd, + im_info, + (proposal_2nd, proposal_score_2nd) + ) + else: + raise ValueError("No more stages") + + return X.group([rec_id, im_id, im_info, cls_score, bbox_xyxy]) diff --git a/models/aligndet/encode_anchor.py b/models/aligndet/encode_anchor.py new file mode 100644 index 0000000..07dabe8 --- /dev/null +++ b/models/aligndet/encode_anchor.py @@ -0,0 +1,185 @@ +""" +Encode boxes for anchors w.r.t matching gt_boxes +author: Chenxia Han + +input: + anchor: (N, H * W * A, 4) + gt_boxes: (N, MAX_NUM_GT, 5) + im_info: (N, 3) +output: + label: (N, \sum{A * H * W}) + target: (N, A * 4, \sum{H * W}) + weight: (N, A * 4, \sum{H * W}) +""" + +import mxnet as mx +import numpy as np + +from models.aligndet.input import AlignPyramidAnchorTarget2D + +class AnchorTarget2DParam: + def __init__(self, short, long, stride, mean, std, class_agnostic, + all_anchor_list, allowed_border, pos_thr, neg_thr, min_pos_thr): + self.mean = mean + self.std = std + self.class_agnostic = class_agnostic + + # input anchor + self.all_anchor_list = all_anchor_list + + self.generate.short = tuple(short) + self.generate.long = tuple(long) + self.generate.stride = tuple(stride) + self.assign.allowed_border = allowed_border + self.assign.pos_thr = pos_thr + self.assign.neg_thr = neg_thr + self.assign.min_pos_thr = min_pos_thr + + class generate: + short = None + long = None + stride = None + + class assign: + allowed_border = None + pos_thr = None + neg_thr = None + min_pos_thr = None + + +class EncodeAnchorOperator(mx.operator.CustomOp): + def __init__(self, short, long, stride, mean, std, class_agnostic, + allowed_border, pos_thr, neg_thr, min_pos_thr): + self.short = short + self.long = long + self.stride = stride + self.mean = mean + self.std = std + self.class_agnostic = class_agnostic + self.allowed_border = allowed_border + self.pos_thr = pos_thr + self.neg_thr = neg_thr + self.min_pos_thr = min_pos_thr + + def forward(self, is_train, req, in_data, out_data, aux): + anchor_list = in_data[:-2] + gt_boxes = in_data[-2] + im_info = in_data[-1] + + nbatch = in_data[0].shape[0] + + short = self.short + long = self.long + stride = self.stride + mean = self.mean + std = self.std + class_agnostic = self.class_agnostic + allowed_border = self.allowed_border + pos_thr = self.pos_thr + neg_thr = self.neg_thr + min_pos_thr = self.min_pos_thr + + label_list = [] + target_list = [] + weight_list = [] + + for i in range(nbatch): + anchor_list_np = [] + for anchor in anchor_list: + anchor_list_np.append(anchor[i].asnumpy()) + + anchor_param = AnchorTarget2DParam( + short, + long, + stride, + mean, + std, + class_agnostic, + anchor_list_np, + allowed_border, + pos_thr, + neg_thr, + min_pos_thr + ) + anchor_target_2d = AlignPyramidAnchorTarget2D(anchor_param) + input_record = {"im_info": im_info[i].asnumpy(), "gt_bbox": gt_boxes[i].asnumpy()} + anchor_target_2d.apply(input_record) + + label_list.append(mx.nd.array(input_record["rpn_cls_label"])) + 
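+ # per-image label/target/weight are stacked into (N, ...) batch tensors once all images are processed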
target_list.append(mx.nd.array(input_record["rpn_reg_target"])) + weight_list.append(mx.nd.array(input_record["rpn_reg_weight"])) + + label = mx.nd.stack(*label_list, axis=0) + target = mx.nd.stack(*target_list, axis=0) + weight = mx.nd.stack(*weight_list, axis=0) + + self.assign(out_data[0], req[0], label) + self.assign(out_data[1], req[1], target) + self.assign(out_data[2], req[2], weight) + + def backward(self, req, out_grad, in_data, out_data, in_grad, aux): + num_input = len(in_data) + for i in range(num_input): + self.assign(in_grad[i], req[i], 0) + + +@mx.operator.register("encode_anchor") +class EncodeAnchorProp(mx.operator.CustomOpProp): + def __init__(self, short, long, stride, num_anchors, class_agnostic, + allowed_border, pos_thr, neg_thr, min_pos_thr, + mean="(0, 0, 0, 0)", std="(1, 1, 1, 1)"): + super(EncodeAnchorProp, self).__init__(need_top_grad=False) + self.short = eval(short) + self.long = eval(long) + self.stride = eval(stride) + self.num_anchors = eval(num_anchors) + self.class_agnostic = eval(class_agnostic) + self.allowed_border = int(allowed_border) + self.pos_thr = float(pos_thr) + self.neg_thr = float(neg_thr) + self.min_pos_thr = float(min_pos_thr) + self.mean = eval(mean) + self.std = eval(std) + + def list_arguments(self): + args_list = [] + for s in self.stride: + args_list.append("stride%s" % s) + args_list += ["gt_boxes", "im_info"] + + return args_list + + def list_outputs(self): + return ["label", "target", "weight"] + + def infer_shape(self, in_shape): + anchor_shape_list = in_shape[:-2] + gt_boxes_shape = in_shape[-2] + im_info_shape = in_shape[-1] + + nbatch = im_info_shape[0] + + assert(anchor_shape_list[0][2] == 4) + assert(gt_boxes_shape[2] == 5) + assert(im_info_shape[1] == 3) + assert(anchor_shape_list[0][0] == nbatch) + assert(gt_boxes_shape[0] == nbatch) + + num_anchors = self.num_anchors + total_anchors = np.sum([x[1] for x in anchor_shape_list]) + + label_shape = (nbatch, total_anchors) + target_shape = (nbatch, num_anchors * 4, total_anchors // num_anchors) + weight_shape = (nbatch, num_anchors * 4, total_anchors // num_anchors) + + return anchor_shape_list + [gt_boxes_shape, im_info_shape], \ + [label_shape, target_shape, weight_shape] + + def create_operator(self, ctx, shapes, dtypes): + return EncodeAnchorOperator(self.short, self.long, self.stride, + self.mean, self.std, self.class_agnostic, + self.allowed_border, self.pos_thr, + self.neg_thr, self.min_pos_thr) + + def declare_backward_dependency(self, out_grad, in_data, out_data): + return [] diff --git a/models/aligndet/input.py b/models/aligndet/input.py new file mode 100644 index 0000000..e53a46d --- /dev/null +++ b/models/aligndet/input.py @@ -0,0 +1,92 @@ +from __future__ import division +from __future__ import print_function + +import numpy as np + +from models.retinanet.input import PyramidAnchorTarget2DBase + +class AlignPyramidAnchorTarget2D(PyramidAnchorTarget2DBase): + """ + input: image_meta: tuple(h, w, scale) + gt_bbox, ndarry(max_num_gt, 4) + output: anchor_label, ndarray(num_anchor * h * w) + anchor_bbox_target, ndarray(num_anchor * 4, h * w) + anchor_bbox_weight, ndarray(num_anchor * 4, h * w) + """ + + def __init__(self, pAnchor): + super(AlignPyramidAnchorTarget2D, self).__init__(pAnchor) + + self.pyramid_levels = len(self.p.generate.stride) + + self.anchor_target_2d = PyramidAnchorTarget2DBase(self.p) + self.all_anchor_list = self.p.all_anchor_list + + self.anchor_target_2d.v_all_anchor = self.v_all_anchor + self.anchor_target_2d.h_all_anchor = self.h_all_anchor + + 
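# the v_all_anchor/h_all_anchor overrides below make the inner target maker label the externally supplied (decoded) anchors instead of regenerating fixed ones +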
@property + def v_all_anchor(self): + anchors = np.concatenate(self.all_anchor_list) + return anchors + + @property + def h_all_anchor(self): + anchors = np.concatenate(self.all_anchor_list) + return anchors + + def apply(self, input_record): + anchor_size = [0] + [x.shape[0] for x in self.all_anchor_list] + anchor_size = np.cumsum(anchor_size) + cls_label, reg_target, reg_weight = \ + self.anchor_target_2d.apply(input_record) + + im_info = input_record["im_info"] + h, w = im_info[:2] + + mean = np.array(self.p.mean) + std = np.array(self.p.std) + + cls_label_list = [] + reg_target_list = [] + reg_weight_list = [] + for i in range(self.pyramid_levels): + p = self.p + + cls_label_level = cls_label[anchor_size[i]:anchor_size[i+1]] + reg_target_level = reg_target[anchor_size[i]:anchor_size[i+1]] + reg_weight_level = reg_weight[anchor_size[i]:anchor_size[i+1]] + """ + label: (h * w * A) -> (A * h * w) + bbox_target: (h * w * A, 4) -> (A * 4, h * w) + bbox_weight: (h * w * A, 4) -> (A * 4, h * w) + """ + if h >= w: + fh, fw = p.generate.long[i], p.generate.short[i] + else: + fh, fw = p.generate.short[i], p.generate.long[i] + + reg_target_level = (reg_target_level - mean) / std + + cls_label_level = cls_label_level.reshape((fh, fw, -1)).transpose(2, 0, 1).reshape(-1) + reg_target_level = reg_target_level.reshape((fh, fw, -1)).transpose(2, 0, 1) + reg_weight_level = reg_weight_level.reshape((fh, fw, -1)).transpose(2, 0, 1) + + reg_target_level = reg_target_level.reshape(-1, fh * fw) + reg_weight_level = reg_weight_level.reshape(-1, fh * fw) + + cls_label_list.append(cls_label_level) + reg_target_list.append(reg_target_level) + reg_weight_list.append(reg_weight_level) + + cls_label = np.concatenate(cls_label_list, axis=0) + reg_target = np.concatenate(reg_target_list, axis=1) + reg_weight = np.concatenate(reg_weight_list, axis=1) + + input_record["rpn_cls_label"] = cls_label + input_record["rpn_reg_target"] = reg_target + input_record["rpn_reg_weight"] = reg_weight + + return input_record["rpn_cls_label"], \ + input_record["rpn_reg_target"], \ + input_record["rpn_reg_weight"] diff --git a/operator_cxx/contrib/deformable_convolution_im2col-inl.h b/operator_cxx/contrib/deformable_convolution_im2col-inl.h new file mode 100644 index 0000000..49cbe0a --- /dev/null +++ b/operator_cxx/contrib/deformable_convolution_im2col-inl.h @@ -0,0 +1,477 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
diff --git a/operator_cxx/contrib/deformable_convolution_im2col-inl.h b/operator_cxx/contrib/deformable_convolution_im2col-inl.h
new file mode 100644
index 0000000..49cbe0a
--- /dev/null
+++ b/operator_cxx/contrib/deformable_convolution_im2col-inl.h
@@ -0,0 +1,477 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * Copyright (c) 2017 Microsoft
+ * Licensed under The Apache-2.0 License [see LICENSE for details]
+ * \file deformable_convolution_im2col-inl.h
+ * \brief
+ * \ref: https://github.com/Yangqing/caffe/wiki/Convolution-in-Caffe:-a-memo
+ * \ref: https://arxiv.org/abs/1703.06211
+ * \author Yuwen Xiong, Haozhi Qi, Jifeng Dai, Yuntao Chen
+*/
+#ifndef MXNET_OPERATOR_CONTRIB_DEFORMABLE_CONVOLUTION_IM2COL_INL_H_
+#define MXNET_OPERATOR_CONTRIB_DEFORMABLE_CONVOLUTION_IM2COL_INL_H_
+
+#include <mxnet/io.h>
+#include <mxnet/base.h>
+#include <mxnet/ndarray.h>
+#include <mxnet/operator.h>
+#include <dmlc/logging.h>
+#include <dmlc/optional.h>
+#include <algorithm>
+#include <map>
+#include <vector>
+#include <string>
+#include <utility>
+#include "../operator_common.h"
+#include "../nn/im2col.h"
+#include "./nn/deformable_im2col.h"
+#include "../linalg.h"
+
+
+namespace mxnet {
+namespace op {
+
+namespace conv {
+  enum DeformableConvolutionIm2ColOpInputs { kData, kOffset };
+  enum DeformableConvolutionIm2ColOpOutputs { kOut };
+  enum DeformableConvolutionIm2ColOpResource { kTempSpace };
+}
+
+struct DeformableConvolutionIm2ColParam : public dmlc::Parameter<DeformableConvolutionIm2ColParam> {
+  mxnet::TShape kernel;
+  mxnet::TShape stride;
+  mxnet::TShape dilate;
+  mxnet::TShape pad;
+  uint32_t num_filter;
+  uint32_t num_group;
+  uint32_t num_deformable_group;
+  uint64_t workspace;
+  bool no_bias;
+  dmlc::optional<int> layout;
+  DMLC_DECLARE_PARAMETER(DeformableConvolutionIm2ColParam) {
+    DMLC_DECLARE_FIELD(kernel).describe("Convolution kernel size: (h, w) or (d, h, w)");
+    DMLC_DECLARE_FIELD(stride).set_default(mxnet::TShape(0, -1))
+      .describe("Convolution stride: (h, w) or (d, h, w). Defaults to 1 for each dimension.");
+    DMLC_DECLARE_FIELD(dilate).set_default(mxnet::TShape(0, -1))
+      .describe("Convolution dilate: (h, w) or (d, h, w). Defaults to 1 for each dimension.");
+    DMLC_DECLARE_FIELD(pad).set_default(mxnet::TShape(0, -1))
+      .describe("Zero pad for convolution: (h, w) or (d, h, w). Defaults to no padding.");
+    DMLC_DECLARE_FIELD(num_filter).set_range(1, 100000)
+      .describe("Convolution filter(channel) number");
+    DMLC_DECLARE_FIELD(num_group).set_default(1)
+      .describe("Number of group partitions.");
+    DMLC_DECLARE_FIELD(num_deformable_group).set_default(1)
+      .describe("Number of deformable group partitions.");
+    DMLC_DECLARE_FIELD(workspace).set_default(1024).set_range(0, 8192)
+      .describe("Maximum temporary workspace allowed for convolution (MB).");
+    DMLC_DECLARE_FIELD(no_bias).set_default(false)
+      .describe("Whether to disable bias parameter.");
+    DMLC_DECLARE_FIELD(layout)
+      .add_enum("NCW", mshadow::kNCW)
+      .add_enum("NCHW", mshadow::kNCHW)
+      .add_enum("NCDHW", mshadow::kNCDHW)
+      .set_default(dmlc::optional<int>())
+      .describe("Set layout for input, output and weight. Empty for\n    "
+                "default layout: NCW for 1d, NCHW for 2d and NCDHW for 3d.");
+  }
+};
+
+template<typename xpu, typename DType>
+class DeformableConvolutionIm2ColOp : public Operator {
+ public:
+  explicit DeformableConvolutionIm2ColOp(DeformableConvolutionIm2ColParam p) {
+    this->param_ = p;
+    // convert MBytes first to Bytes and then to elements.
+    param_.workspace = (param_.workspace << 20) / sizeof(DType);
+    CHECK(param_.layout.value() == mshadow::kNCW ||
+          param_.layout.value() == mshadow::kNCHW ||
+          param_.layout.value() == mshadow::kNCDHW)
+      << "Only support NCW, NCHW and NCDHW layout";
+  }
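Editor's note: a quick standalone check of the MB-to-elements conversion in the constructor (a sketch, not from the source):

    # 1024 MB expressed in float32 elements, mirroring
    # param_.workspace = (param_.workspace << 20) / sizeof(DType)
    workspace_mb = 1024
    sizeof_float32 = 4
    elements = (workspace_mb << 20) // sizeof_float32
    assert elements == 268435456  # 256M float32 values == 1 GiB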
+
+  virtual void Forward(const OpContext &ctx,
+                       const std::vector<TBlob> &in_data,
+                       const std::vector<OpReqType> &req,
+                       const std::vector<TBlob> &out_data,
+                       const std::vector<TBlob> &aux_args) {
+    using namespace mshadow;
+    using namespace mshadow::expr;
+    CHECK_EQ(req[conv::kOut], kWriteTo);
+    CHECK_EQ(in_data.size(), 2U);
+    CHECK_EQ(out_data.size(), 1U);
+    LayerSetUp(in_data[conv::kData].shape_,
+               in_data[conv::kOffset].shape_,
+               in_data[conv::kData].shape_);
+    Stream<xpu>* s = ctx.get_stream<xpu>();
+    // allocate workspace for col_buffer
+    Tensor<xpu, 1, DType> workspace = ctx.requested[conv::kTempSpace]
+      .get_space_typed<xpu, 1, DType>(Shape1(col_buffer_size_), s);
+    // calculate the shape of col_buffer
+    mxnet::TShape col_buffer_shape(num_spatial_axes_ + 1, -1);
+    col_buffer_shape[0] = conv_in_channels_ * param_.kernel.Size();
+    for (size_t i = 1; i < col_buffer_shape.ndim(); ++i) {
+      col_buffer_shape[i] = in_data[conv::kData].shape_[i + 1];
+    }
+    // create a column buffer using workspace and col_buffer_shape
+    TBlob col_buffer(workspace.dptr_, col_buffer_shape, xpu::kDevMask, DataType<DType>::kFlag);
+
+    // initialize weight and col_buffer 3D tensors for using gemm
+    // index_t M = conv_out_channels_ / group_;
+    index_t N = conv_out_spatial_dim_;
+    index_t K = kernel_dim_;
+    Tensor<xpu, 2, DType> col_buffer_2d = col_buffer.get_with_shape<xpu, 2, DType>(
+      Shape2(K, N), s);
+    Tensor<xpu, 3, DType> output_3d = out_data[conv::kOut].get_with_shape<xpu, 3, DType>(
+      Shape3(num_, K, N), s);
+
+    for (index_t n = 0; n < num_; ++n) {
+      // transform image to col_buffer in order to use gemm
+      deformable_im2col(
+        s,
+        in_data[conv::kData].dptr<DType>() + n*input_dim_,
+        in_data[conv::kOffset].dptr<DType>() + n*input_offset_dim_,
+        in_data[conv::kData].shape_,
+        col_buffer.shape_,
+        param_.kernel,
+        param_.pad,
+        param_.stride,
+        param_.dilate,
+        param_.num_deformable_group,
+        col_buffer.dptr<DType>());
+      Copy(output_3d[n], col_buffer_2d, s);
+    }
+  }
+
+  virtual void Backward(const OpContext &ctx,
+                        const std::vector<TBlob>& out_grad,
+                        const std::vector<TBlob>& in_data,
+                        const std::vector<TBlob>& out_data,
+                        const std::vector<OpReqType>& req,
+                        const std::vector<TBlob>& in_grad,
+                        const std::vector<TBlob>& aux_args) {
+    using namespace mshadow;
+    using namespace mshadow::expr;
+    CHECK_EQ(out_grad.size(), 1U);
+    size_t expected = 2;
+    CHECK(in_data.size() == expected && in_grad.size() == expected);
+    CHECK_EQ(req.size(), expected);
+    LayerSetUp(in_grad[conv::kData].shape_,
+               in_grad[conv::kOffset].shape_,
+               in_grad[conv::kData].shape_);
+    Stream<xpu> *s = ctx.get_stream<xpu>();
+    // allocate workspace for col_buffer
+    Tensor<xpu, 1, DType> workspace = ctx.requested[conv::kTempSpace]
+      .get_space_typed<xpu, 1, DType>(Shape1(col_buffer_size_), s);
+    // calculate the shape of col_buffer
+    mxnet::TShape col_buffer_shape(num_spatial_axes_ + 1, -1);
+    col_buffer_shape[0] = conv_in_channels_ * param_.kernel.Size();
+    for (index_t i = 1; i < col_buffer_shape.ndim(); ++i) {
+      col_buffer_shape[i] = in_grad[conv::kData].shape_[i + 1];
+    }
+
+    // create a column buffer using workspace and col_buffer_shape
+    TBlob col_buffer(workspace.dptr_, col_buffer_shape, xpu::kDevMask, DataType<DType>::kFlag);
+
+    // initialize weight and col_buffer 3D tensors for using gemm
+    // For computing dLoss/d(in_data[kData])
+    index_t M = kernel_dim_;
+    index_t N = conv_out_spatial_dim_;
+    // index_t K = conv_out_channels_ / group_;
+    Tensor<xpu, 3, DType> out_grad_3d = out_grad[conv::kOut].get_with_shape<xpu, 3, DType>(
+      Shape3(num_, M, N), s);
+    Tensor<xpu, 2, DType> col_buffer_2d = col_buffer.get_with_shape<xpu, 2, DType>(
+      Shape2(M, N), s);
+    // zero-initialize the gradients; only dLoss/d(data) is filled below,
+    // the offset input receives no gradient from this op
+    Tensor<xpu, 1, DType> data_grad = in_grad[conv::kData].FlatTo1D<xpu, DType>(s);
+    data_grad = 0;
+    Tensor<xpu, 1, DType> offset_grad = in_grad[conv::kOffset].FlatTo1D<xpu, DType>(s);
+    offset_grad = 0;
+
+    for (index_t n = 0; n < num_; ++n) {
+      Tensor<xpu, 2, DType> out_grad_2d = out_grad_3d[n];
+      Copy(col_buffer_2d, out_grad_2d, s);
+
+      // gradient w.r.t. input data
+      deformable_col2im(
+        s,
+        col_buffer.dptr<DType>(),
+        in_data[conv::kOffset].dptr<DType>() + n*input_offset_dim_,
+        in_grad[conv::kData].shape_,
+        col_buffer.shape_,
+        param_.kernel,
+        param_.pad,
+        param_.stride,
+        param_.dilate,
+        param_.num_deformable_group,
+        in_grad[conv::kData].dptr<DType>() + n*input_dim_);
+    }
+  }
+
+ private:
+  void LayerSetUp(const mxnet::TShape& ishape,
+                  const mxnet::TShape& offset_shape,
+                  const mxnet::TShape& oshape) {
+    channel_axis_ = 1;  // hard code channel axis
+    const index_t first_spatial_axis = channel_axis_ + 1;
+    const index_t num_axes = param_.kernel.ndim() + 2;
+    num_spatial_axes_ = num_axes - first_spatial_axis;
+    is_1x1_ = true;
+    for (index_t i = 0; i < param_.kernel.ndim(); ++i) {
+      is_1x1_ &= param_.kernel[i] == 1 && param_.stride[i] == 1 && param_.pad[i] == 0;
+      if (!is_1x1_) break;
+    }
+
+    // batch size
+    num_ = ishape[0];
+    // number of input channels
+    channels_ = ishape[1];
+    group_ = param_.num_group;
+    conv_out_channels_ = channels_;
+    conv_in_channels_ = channels_;
+    kernel_dim_ = conv_in_channels_ / group_ * param_.kernel.Size();
+    weight_offset_ = conv_out_channels_ * kernel_dim_ / group_;
+    conv_out_spatial_dim_ = oshape.ProdShape(2, oshape.ndim());
+    col_offset_ = kernel_dim_ * conv_out_spatial_dim_;
+    output_offset_ = conv_out_channels_ * conv_out_spatial_dim_ / group_;
+    // size of the column buffer used for storing im2col-ed pixels
+    col_buffer_size_ = kernel_dim_ * group_ * conv_out_spatial_dim_;
+    // input/output image size (#channels * height * width)
+    input_dim_ = ishape.ProdShape(1, ishape.ndim());
+    input_offset_dim_ = offset_shape.ProdShape(1, offset_shape.ndim());
+    output_dim_ = oshape.ProdShape(1, oshape.ndim());
+    num_kernels_im2col_ = conv_in_channels_ * conv_out_spatial_dim_;
+    num_kernels_col2im_ = input_dim_;
+  }
+
+ private:
+  DeformableConvolutionIm2ColParam param_;
+  index_t channel_axis_;          // channel axis of the input
+  index_t channels_;              // number of channels of input image
+  index_t num_spatial_axes_;      // number of spatial axes
+  index_t num_;                   // batch size
+  index_t group_;                 // number of groups
+  index_t conv_out_channels_;     // number of output channels (num_filter)
+  index_t conv_out_spatial_dim_;  // number of pixels of output images per channel
+  index_t conv_in_channels_;      // number of input channels
+  index_t kernel_dim_;            // number of input channels per group * kernel size
+  index_t weight_offset_;         // number of output channels per group * kernel_dim_
+  index_t col_offset_;
+  index_t output_offset_;
+  index_t col_buffer_size_;
+  index_t input_dim_;
+  index_t input_offset_dim_;
+  index_t output_dim_;
+  index_t num_kernels_im2col_;
+  index_t num_kernels_col2im_;
+  bool is_1x1_;
+};  // class DeformableConvolutionIm2ColOp
+
+template<typename xpu>
+Operator* CreateOp(DeformableConvolutionIm2ColParam param, int dtype,
+                   mxnet::ShapeVector *in_shape,
+                   mxnet::ShapeVector *out_shape,
+                   Context ctx);
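Editor's note: to make the bookkeeping above concrete, a small sketch (made-up sizes) of the shapes the forward pass produces. `kernel_dim_` is C/group * kh * kw, and the op emits the raw column buffer instead of a convolution result:

    # Illustrative shape arithmetic for the im2col-style forward (group = 1):
    n, c, h, w = 2, 256, 32, 32      # data:   (n, c, h, w)
    kh, kw = 7, 7                    # kernel
    kernel_dim = c * kh * kw         # rows of the column buffer
    out_spatial = h * w              # stride 1, pad (3, 3) keeps h * w
    # offset: (n, num_deformable_group * 2 * kh * kw, h, w)
    # out:    (n, kernel_dim, out_spatial)
    assert (n, kernel_dim, out_spatial) == (2, 12544, 1024)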
+
+#if DMLC_USE_CXX11
+class DeformableConvolutionIm2ColProp : public OperatorProperty {
+ public:
+  std::vector<std::string> ListArguments() const override {
+    return{ "data", "offset" };
+  }
+
+  void Init(const std::vector<std::pair<std::string, std::string> >& kwargs) override {
+    using namespace mshadow;
+    param_.Init(kwargs);
+    if (param_.kernel.ndim() == 2) {
+      param_.layout = param_.layout ? param_.layout.value() : mshadow::kNCHW;
+      if (param_.stride.ndim() == 0) param_.stride = Shape2(1, 1);
+      if (param_.dilate.ndim() == 0) param_.dilate = Shape2(1, 1);
+      if (param_.pad.ndim() == 0) param_.pad = Shape2(0, 0);
+    } else {
+      LOG(FATAL) << "not implemented";
+    }
+  }
+
+  std::map<std::string, std::string> GetParams() const override {
+    return param_.__DICT__();
+  }
+
+  bool InferShape(mxnet::ShapeVector *in_shape,
+                  mxnet::ShapeVector *out_shape,
+                  mxnet::ShapeVector *aux_shape) const override {
+    using namespace mshadow;
+    CHECK_EQ(in_shape->size(), 2U) << "Input:[data, offset]";
+    out_shape->resize(1, mxnet::TShape());
+    const mxnet::TShape &dshp = (*in_shape)[conv::kData];
+    const mxnet::TShape &oshp = (*in_shape)[conv::kOffset];
+    if (dshp.ndim() == 0) return false;
+    if (param_.kernel.ndim() == 2) {
+      // 2d conv
+      CHECK_EQ(dshp.ndim(), 4U) \
+        << "Input data should be 4D in batch-num_filter-y-x";
+      CHECK_EQ(oshp.ndim(), 4U) \
+        << "Input offset should be 4D in batch-num_filter-y-x";
+      Shape<4> dshape = ConvertLayout(dshp.get<4>(), param_.layout.value(), kNCHW);
+      Shape<4> offsetshape = ConvertLayout(oshp.get<4>(), param_.layout.value(), kNCHW);
+
+      const index_t ksize_y = static_cast<index_t>(param_.kernel[0]);
+      const index_t ksize_x = static_cast<index_t>(param_.kernel[1]);
+      CHECK_EQ(dshape[1] % param_.num_group, 0U) \
+        << "input num_filter must divide group size";
+      CHECK_EQ(dshape[1] % param_.num_deformable_group, 0U) \
+        << "input num_filter must divide deformable group size";
+      CHECK_EQ(param_.num_filter % param_.num_group, 0U) \
+        << "output num_filter must divide group size";
+      CHECK_GT(param_.kernel.Size(), 0U) \
+        << "incorrect kernel size: " << param_.kernel;
+      CHECK_GT(param_.stride.Size(), 0U) \
+        << "incorrect stride size: " << param_.stride;
+      CHECK_GT(param_.dilate.Size(), 0U) \
+        << "incorrect dilate size: " << param_.dilate;
+      // unlike the original deformable convolution, the output here is the
+      // raw column buffer: (n, c * kh * kw, out_h * out_w) flattened spatially
+      Shape<4> oshape;
+      oshape[0] = dshape[0];
+      oshape[1] = dshape[1] * param_.kernel.Size();
+      oshape[2] = (dshape[2] + 2 * param_.pad[0] -
+                   (param_.dilate[0] * (ksize_y - 1) + 1)) / param_.stride[0] + 1;
+      oshape[3] = (dshape[3] + 2 * param_.pad[1] -
+                   (param_.dilate[1] * (ksize_x - 1) + 1)) / param_.stride[1] + 1;
+      SHAPE_ASSIGN_CHECK(*out_shape, 0, oshape);
+      CHECK_EQ(offsetshape[1] % (param_.kernel[0] * param_.kernel[1]), 0U) \
+        << "offset filter must divide deformable group size";
+      CHECK_EQ(offsetshape[1] / (2 * param_.kernel[0] * param_.kernel[1]), \
+               param_.num_deformable_group) \
+        << "offset filter must divide deformable group size";
+      return true;
+    } else {
+      LOG(FATAL) << "not implemented";
+      return false;
+    }
+  }
+
+  bool InferType(std::vector<int> *in_type,
+                 std::vector<int> *out_type,
+                 std::vector<int> *aux_type) const override {
+    CHECK_GE(in_type->size(), 1U);
+    int dtype = (*in_type)[0];
+    CHECK_NE(dtype, -1) << "First input must have specified type";
+    for (size_t i = 0; i < in_type->size(); ++i) {
+      if ((*in_type)[i] == -1) {
+        (*in_type)[i] = dtype;
+      } else {
+        UNIFORM_TYPE_CHECK((*in_type)[i], dtype, ListArguments()[i]);
+      }
+    }
+    out_type->clear();
+    out_type->push_back(dtype);
+    return true;
+  }
+
+  OperatorProperty* Copy() const override {
+    auto ptr = new DeformableConvolutionIm2ColProp();
+    ptr->param_ = param_;
+    return ptr;
+  }
+
+  std::string TypeString() const override {
+    return "_contrib_DeformableConvolutionIm2Col";
+  }
+
+  std::vector<int> DeclareBackwardDependency(
+    const std::vector<int> &out_grad,
+    const std::vector<int> &in_data,
+    const std::vector<int> &out_data) const override {
+    return{ out_grad[conv::kOut], in_data[conv::kData],
+            in_data[conv::kOffset] };
+  }
+
+  std::vector<ResourceRequest> ForwardResource(
+    const mxnet::ShapeVector &in_shape) const override {
+    return{ ResourceRequest::kTempSpace };
+  }
+
+  std::vector<ResourceRequest> BackwardResource(
+    const mxnet::ShapeVector &in_shape) const override {
+    return{ ResourceRequest::kTempSpace };
+  }
+
+  Operator* CreateOperator(Context ctx) const override {
+    LOG(FATAL) << "Not Implemented.";
+    return NULL;
+  }
+
+  Operator* CreateOperatorEx(Context ctx, mxnet::ShapeVector *in_shape,
+                             std::vector<int> *in_type) const override;
+
+ private:
+  DeformableConvolutionIm2ColParam param_;
+};  // class DeformableConvolutionIm2ColProp
+#endif  // DMLC_USE_CXX11
+}  // namespace op
+}  // namespace mxnet
+#endif  // MXNET_OPERATOR_CONTRIB_DEFORMABLE_CONVOLUTION_IM2COL_INL_H_
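Editor's note: the output spatial size computed in InferShape above (and restated in the operator docstring registered below) follows the usual convolution arithmetic; a quick standalone check:

    def out_size(x, k, p, s, d):
        # f(x, k, p, s, d) = floor((x + 2p - (d * (k - 1) + 1)) / s) + 1
        return (x + 2 * p - (d * (k - 1) + 1)) // s + 1

    # a 7x7 kernel with pad 3, stride 1, dilation 1 preserves the spatial size
    assert out_size(32, 7, 3, 1, 1) == 32
    # stride 2 halves it (rounding down)
    assert out_size(32, 3, 1, 2, 1) == 16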
diff --git a/operator_cxx/contrib/deformable_convolution_im2col.cc b/operator_cxx/contrib/deformable_convolution_im2col.cc
new file mode 100644
index 0000000..30613dc
--- /dev/null
+++ b/operator_cxx/contrib/deformable_convolution_im2col.cc
@@ -0,0 +1,104 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * Copyright (c) 2017 Microsoft
+ * Licensed under The Apache-2.0 License [see LICENSE for details]
+ * \file deformable_convolution_im2col.cc
+ * \brief
+ * \author Yuwen Xiong, Haozhi Qi, Jifeng Dai, Chenxia Han, Yuntao Chen
+*/
+
+#include "./deformable_convolution_im2col-inl.h"
+
+namespace mxnet {
+namespace op {
+DMLC_REGISTER_PARAMETER(DeformableConvolutionIm2ColParam);
+
+template<>
+Operator* CreateOp<cpu>(DeformableConvolutionIm2ColParam param, int dtype,
+                        mxnet::ShapeVector *in_shape,
+                        mxnet::ShapeVector *out_shape,
+                        Context ctx) {
+  Operator *op = nullptr;
+  MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
+    op = new DeformableConvolutionIm2ColOp<cpu, DType>(param);
+  })
+  return op;
+}
+
+// DO_BIND_DISPATCH comes from operator_common.h
+Operator *DeformableConvolutionIm2ColProp::CreateOperatorEx(Context ctx,
+                                                            mxnet::ShapeVector *in_shape,
+                                                            std::vector<int> *in_type) const {
+  mxnet::ShapeVector out_shape, aux_shape;
+  std::vector<int> out_type, aux_type;
+  CHECK(InferType(in_type, &out_type, &aux_type));
+  CHECK(InferShape(in_shape, &out_shape, &aux_shape));
+  DO_BIND_DISPATCH(CreateOp, param_, (*in_type)[0], in_shape, &out_shape, ctx);
+}
+
+MXNET_REGISTER_OP_PROPERTY(_contrib_DeformableConvolutionIm2Col, DeformableConvolutionIm2ColProp)
+.describe(R"code(Compute 2-D deformable convolution on 4-D input.
+
+The deformable convolution operation is described in https://arxiv.org/abs/1703.06211
+
+For 2-D deformable convolution, the shapes are
+
+- **data**: *(batch_size, channel, height, width)*
+- **offset**: *(batch_size, num_deformable_group * kernel[0] * kernel[1], height, width)*
+- **out**: *(batch_size, channel * kernel[0] * kernel[1], out_height * out_width)*.
+
+Define::
+
+  f(x,k,p,s,d) = floor((x+2*p-d*(k-1)-1)/s)+1
+
+then we have::
+
+  out_height=f(height, kernel[0], pad[0], stride[0], dilate[0])
+  out_width=f(width, kernel[1], pad[1], stride[1], dilate[1])
+
+The default data ``layout`` is *NCHW*, namely *(batch_size, channel, height,
+width)*.
+
+If ``num_group`` is larger than 1, denoted by *g*, then split the input ``data``
+evenly into *g* parts along the channel axis, and also evenly split ``weight``
+along the first dimension. Next compute the convolution on the *i*-th part of
+the data with the *i*-th weight part. The output is obtained by concatenating all
+the *g* results.
+
+If ``num_deformable_group`` is larger than 1, denoted by *dg*, then split the
+input ``offset`` evenly into *dg* parts along the channel axis, and also split
+``out`` evenly into *dg* parts along the channel axis.
+Next compute the deformable convolution, applying the *i*-th part of the offset
+to the *i*-th part of the output.
+
+)code" ADD_FILELINE)
+.add_argument("data", "NDArray-or-Symbol", "Input data to the DeformableConvolutionIm2ColOp.")
+.add_argument("offset", "NDArray-or-Symbol", "Input offset to the DeformableConvolutionIm2ColOp.")
+.add_arguments(DeformableConvolutionIm2ColParam::__FIELDS__());
+
+}  // namespace op
+}  // namespace mxnet
diff --git a/operator_cxx/contrib/deformable_convolution_im2col.cu b/operator_cxx/contrib/deformable_convolution_im2col.cu
new file mode 100644
index 0000000..f054c22
--- /dev/null
+++ b/operator_cxx/contrib/deformable_convolution_im2col.cu
@@ -0,0 +1,48 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * Copyright (c) 2017 Microsoft
+ * Licensed under The Apache-2.0 License [see LICENSE for details]
+ * \file deformable_convolution_im2col.cu
+ * \brief
+ * \author Yuwen Xiong, Haozhi Qi, Jifeng Dai, Chenxia Han, Yuntao Chen
+*/
+
+#include "./deformable_convolution_im2col-inl.h"
+#include <vector>
+
+namespace mxnet {
+namespace op {
+
+template<>
+Operator* CreateOp<gpu>(DeformableConvolutionIm2ColParam param, int dtype,
+                        mxnet::ShapeVector *in_shape,
+                        mxnet::ShapeVector *out_shape,
+                        Context ctx) {
+  Operator *op = NULL;
+  MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
+    op = new DeformableConvolutionIm2ColOp<gpu, DType>(param);
+  })
+  return op;
+}
+
+}  // namespace op
+}  // namespace mxnet
diff --git a/operator_cxx/contrib/get_anchor_offset-inl.h b/operator_cxx/contrib/get_anchor_offset-inl.h
new file mode 100644
index 0000000..62238e5
--- /dev/null
+++ b/operator_cxx/contrib/get_anchor_offset-inl.h
@@ -0,0 +1,277 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file get_anchor_offset-inl.h
+ * \brief GetAnchorOffset Operator
+ * \author Chenxia Han
+*/
+#ifndef MXNET_OPERATOR_CONTRIB_GET_ANCHOR_OFFSET_INL_H_
+#define MXNET_OPERATOR_CONTRIB_GET_ANCHOR_OFFSET_INL_H_
+
+#include <dmlc/logging.h>
+#include <dmlc/parameter.h>
+#include <mxnet/operator.h>
+#include <cmath>
+#include <cstring>
+#include <ctime>
+#include <iostream>
+#include <algorithm>
+#include <map>
+#include <vector>
+#include <string>
+#include <utility>
+#include "../operator_common.h"
+#include "../mshadow_op.h"
+
+namespace mxnet {
+namespace op {
+
+namespace get_anchor_offset {
+enum GetAnchorOffsetOpInputs {kData, kAnchor};
+enum GetAnchorOffsetOpOutputs {kOut};
+}  // namespace get_anchor_offset
+
+struct GetAnchorOffsetParam : public dmlc::Parameter<GetAnchorOffsetParam> {
+  mxnet::TShape kernel;
+  int stride;
+  DMLC_DECLARE_PARAMETER(GetAnchorOffsetParam) {
+    DMLC_DECLARE_FIELD(kernel).describe("Sample size for each anchor: (h, w)");
+    DMLC_DECLARE_FIELD(stride).describe("Stride at current layer");
+  }
+};
+
+struct anchor_to_offset {
+  template<typename DType>
+  MSHADOW_XINLINE static void Map(int i, int k1, int k2, int stride,
+                                  int num_anchors, int height, int width,
+                                  const DType *anchor, DType *out) {
+    int w = i % width;
+    int h = i / width % height;
+    int a = i / width / height % num_anchors;
+    int n = i / width / height / num_anchors;
+
+    int num_offset = num_anchors * k1 * k2 * 2;
+
+    for (int kh = 0; kh < k1; ++kh) {
+      for (int kw = 0; kw < k2; ++kw) {
+        const DType *box = anchor + ((((n * height) + h) * width + w) * num_anchors + a) * 4;
+        DType x1 = box[0] / stride;
+        DType y1 = box[1] / stride;
+        DType x2 = box[2] / stride;
+        DType y2 = box[3] / stride;
+
+        DType bin_size_x = (x2 - x1 + 1) / k2;
+        DType bin_size_y = (y2 - y1 + 1) / k1;
+
+        int offset_idx = ((a * k1 + kh) * k2 + kw) * 2;
+        int offset_idx_x = ((n * num_offset + offset_idx + 1) * height + h) * width + w;
+        int offset_idx_y = ((n * num_offset + offset_idx + 0) * height + h) * width + w;
+
+        // where to sample: the center of bin (kh, kw) inside the anchor box
+        out[offset_idx_x] = x1 + (bin_size_x-1) / 2 + kw * bin_size_x;
+        out[offset_idx_y] = y1 + (bin_size_y-1) / 2 + kh * bin_size_y;
+
+        // minus where a regular k1 x k2 kernel would sample anyway
+        out[offset_idx_x] -= w + kw - (k2-1) / 2;
+        out[offset_idx_y] -= h + kh - (k1-1) / 2;
+      }
+    }
+  }
+};
+
+template<typename xpu, typename DType>
+class GetAnchorOffsetOp : public Operator {
+ public:
+  explicit GetAnchorOffsetOp(GetAnchorOffsetParam param) {
+    this->param_ = param;
+  }
+
+  virtual void Forward(const OpContext &ctx,
+                       const std::vector<TBlob> &in_data,
+                       const std::vector<OpReqType> &req,
+                       const std::vector<TBlob> &out_data,
+                       const std::vector<TBlob> &aux_states) {
+    using namespace mshadow;
+    using namespace mshadow::expr;
+    using namespace mxnet_op;
+    CHECK_EQ(in_data.size(), 2U);
+    CHECK_EQ(out_data.size(), 1U);
+    CHECK_EQ(req.size(), 1U);
+    CHECK_EQ(req[get_anchor_offset::kOut], kWriteTo);
+
+    Stream<xpu> *s = ctx.get_stream<xpu>();
+
+    /*
+     * data:   (n, c, h, w)
+     * anchor: (n, h * w * a, 4)
+     * out:    (n, a * k_1 * k_2 * 2, h, w)
+     */
+    Tensor<xpu, 4, DType> data = in_data[get_anchor_offset::kData].get<xpu, 4, DType>(s);
+    Tensor<xpu, 3, DType> anchor = in_data[get_anchor_offset::kAnchor].get<xpu, 3, DType>(s);
+    Tensor<xpu, 4, DType> out = out_data[get_anchor_offset::kOut].get<xpu, 4, DType>(s);
+
+    int height = data.size(2);
+    int width = data.size(3);
+    int num_anchors = anchor.size(1) / height / width;
+    int count = anchor.shape_.ProdShape(0, 2);
+
+    Kernel<anchor_to_offset, xpu>::Launch(s, count, param_.kernel[0], param_.kernel[1],
+      param_.stride, num_anchors, height, width, anchor.dptr_, out.dptr_);
+  }
+
+  virtual void Backward(const OpContext &ctx,
+                        const std::vector<TBlob> &out_grad,
+                        const std::vector<TBlob> &in_data,
+                        const std::vector<TBlob> &out_data,
+                        const std::vector<OpReqType> &req,
+                        const std::vector<TBlob> &in_grad,
+                        const std::vector<TBlob> &aux_states) {
+    using namespace mshadow;
+    using namespace mshadow::expr;
+    CHECK_EQ(in_grad.size(), 2U);
+    Stream<xpu> *s = ctx.get_stream<xpu>();
+    Tensor<xpu, 4, DType> gdata = in_grad[get_anchor_offset::kData].get<xpu, 4, DType>(s);
+    Tensor<xpu, 3, DType> ganchor = in_grad[get_anchor_offset::kAnchor].get<xpu, 3, DType>(s);
+
+    // neither input receives a gradient from this op
+    Assign(gdata, req[get_anchor_offset::kData], 0);
+    Assign(ganchor, req[get_anchor_offset::kAnchor], 0);
+  }
+
+ private:
+  GetAnchorOffsetParam param_;
+};  // class GetAnchorOffsetOp
+
+template<typename xpu>
+Operator *CreateOp(GetAnchorOffsetParam param, int dtype);
+
+#if DMLC_USE_CXX11
+class GetAnchorOffsetProp : public OperatorProperty {
+ public:
+  void Init(const std::vector<std::pair<std::string, std::string> >& kwargs) override {
+    param_.Init(kwargs);
+  }
+
+  std::map<std::string, std::string> GetParams() const override {
+    return param_.__DICT__();
+  }
+
+  bool InferShape(std::vector<TShape> *in_shape,
+                  std::vector<TShape> *out_shape,
+                  std::vector<TShape> *aux_shape) const override {
+    using namespace mshadow;
+    CHECK_EQ(in_shape->size(), 2) << "Input:[data, anchor]";
+    const TShape &dshape = in_shape->at(get_anchor_offset::kData);
+    const TShape &ashape = in_shape->at(get_anchor_offset::kAnchor);
+
+    const int num_image = dshape[0];
+    const int channel = dshape[1];
+    const int height = dshape[2];
+    const int width = dshape[3];
+    const int num_anchors = ashape[1] / (height * width);
+    const TShape &kernel = param_.kernel;
+
+    auto data_shape = Shape4(num_image, channel, height, width);
+    auto anchor_shape = Shape3(num_image, height * width * num_anchors, 4);
+    auto offset_shape = Shape4(num_image, num_anchors * kernel[0] * kernel[1] * 2, height, width);
+
+    SHAPE_ASSIGN_CHECK(*in_shape, get_anchor_offset::kData, data_shape);
+    SHAPE_ASSIGN_CHECK(*in_shape, get_anchor_offset::kAnchor, anchor_shape);
+
+    out_shape->clear();
+    // output
+    out_shape->push_back(offset_shape);
+    return true;
+  }
+
+  bool InferType(std::vector<int> *in_type,
+                 std::vector<int> *out_type,
+                 std::vector<int> *aux_type) const override {
+    CHECK_EQ(in_type->size(), 2U);
+    int dtype = (*in_type)[0];
+    CHECK_NE(dtype, -1) << "First input must have specified type";
+    for (size_t i = 0; i < in_type->size(); ++i) {
+      if ((*in_type)[i] == -1) {
+        (*in_type)[i] = dtype;
+      } else {
+        UNIFORM_TYPE_CHECK((*in_type)[i], dtype, ListArguments()[i]);
+      }
+    }
+    out_type->clear();
+    out_type->push_back(dtype);
+    return true;
+  }
+
+  OperatorProperty* Copy() const override {
+    auto ptr = new GetAnchorOffsetProp();
+    ptr->param_ = param_;
+    return ptr;
+  }
+
+  std::string TypeString() const override {
+    return "_contrib_GetAnchorOffset";
+  }
+
+  int NumOutputs() const override {
+    return 1;
+  }
+
+  std::vector<std::string> ListArguments() const override {
+    return {"data", "anchor"};
+  }
+
+  std::vector<std::string> ListOutputs() const override {
+    return {"output"};
+  }
+
+  std::vector<int> DeclareBackwardDependency(
+    const std::vector<int> &out_grad,
+    const std::vector<int> &in_data,
+    const std::vector<int> &out_data) const override {
+    return {};
+  }
+
+  Operator *CreateOperator(Context ctx) const override {
+    LOG(FATAL) << "Not Implemented.";
+    return NULL;
+  }
+
+  Operator *CreateOperatorEx(Context ctx, std::vector<TShape> *in_shape,
+                             std::vector<int> *in_type) const override;
+
+ private:
+  GetAnchorOffsetParam param_;
+};  // class GetAnchorOffsetProp
+
+#endif  // DMLC_USE_CXX11
+}  // namespace op
+}  // namespace mxnet
+
+#endif  // MXNET_OPERATOR_CONTRIB_GET_ANCHOR_OFFSET_INL_H_
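Editor's note: to see what the anchor_to_offset kernel in this header computes, here is a hedged NumPy transcription for a single spatial location (a sketch for illustration; the authoritative version is the C++ kernel above):

    import numpy as np

    # NumPy transcription of anchor_to_offset for one location (h, w) on the
    # feature map; `box` is an (x1, y1, x2, y2) anchor in image coordinates.
    def anchor_offset(box, stride, k1, k2, h, w):
        x1, y1, x2, y2 = [float(v) / stride for v in box]
        bin_x = (x2 - x1 + 1) / k2          # bin width inside the box
        bin_y = (y2 - y1 + 1) / k1          # bin height inside the box
        offset = np.zeros((k1, k2, 2), dtype=np.float32)  # (dy, dx) per tap
        for kh in range(k1):
            for kw in range(k2):
                # where we want to sample: the center of bin (kh, kw)
                sx = x1 + (bin_x - 1) / 2 + kw * bin_x
                sy = y1 + (bin_y - 1) / 2 + kh * bin_y
                # minus where a regular k1 x k2 kernel would sample anyway
                offset[kh, kw, 1] = sx - (w + kw - (k2 - 1) // 2)
                offset[kh, kw, 0] = sy - (h + kh - (k1 - 1) // 2)
        return offset

    # An anchor spanning exactly the regular 3x3 grid around (h, w) = (2, 2)
    # at stride 8 yields zero offsets, i.e. plain convolution.
    assert np.allclose(anchor_offset((8, 8, 24, 24), 8, 3, 3, 2, 2), 0)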
diff --git a/operator_cxx/contrib/get_anchor_offset.cc b/operator_cxx/contrib/get_anchor_offset.cc
new file mode 100644
index 0000000..ca1b70b
--- /dev/null
+++ b/operator_cxx/contrib/get_anchor_offset.cc
@@ -0,0 +1,58 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file get_anchor_offset.cc
+ * \brief
+ * \author Chenxia Han
+*/
+
+#include "./get_anchor_offset-inl.h"
+
+namespace mxnet {
+namespace op {
+
+template<>
+Operator *CreateOp<cpu>(GetAnchorOffsetParam param, int dtype) {
+  Operator *op = nullptr;
+  MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
+    op = new GetAnchorOffsetOp<cpu, DType>(param);
+  });
+  return op;
+}
+
+Operator *GetAnchorOffsetProp::CreateOperatorEx(Context ctx, std::vector<TShape> *in_shape,
+                                                std::vector<int> *in_type) const {
+  std::vector<TShape> out_shape, aux_shape;
+  std::vector<int> out_type, aux_type;
+  CHECK(InferType(in_type, &out_type, &aux_type));
+  CHECK(InferShape(in_shape, &out_shape, &aux_shape));
+  DO_BIND_DISPATCH(CreateOp, param_, in_type->at(0));
+}
+
+DMLC_REGISTER_PARAMETER(GetAnchorOffsetParam);
+
+MXNET_REGISTER_OP_PROPERTY(_contrib_GetAnchorOffset, GetAnchorOffsetProp)
+.describe("Compute offset for Deformable Convolution")
+.add_argument("data", "NDArray-or-Symbol", "Data to determine height and width")
+.add_argument("anchor", "NDArray-or-Symbol", "Anchor to determine offset")
+.add_arguments(GetAnchorOffsetParam::__FIELDS__());
+
+}  // namespace op
+}  // namespace mxnet
diff --git a/operator_cxx/contrib/get_anchor_offset.cu b/operator_cxx/contrib/get_anchor_offset.cu
new file mode 100644
index 0000000..41ce243
--- /dev/null
+++ b/operator_cxx/contrib/get_anchor_offset.cu
@@ -0,0 +1,41 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file get_anchor_offset.cu
+ * \brief
+ * \author Chenxia Han
+*/
+
+#include "./get_anchor_offset-inl.h"
+
+namespace mxnet {
+namespace op {
+
+template<>
+Operator *CreateOp<gpu>(GetAnchorOffsetParam param, int dtype) {
+  Operator *op = NULL;
+  MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
+    op = new GetAnchorOffsetOp<gpu, DType>(param);
+  });
+  return op;
+}
+
+}  // namespace op
+}  // namespace mxnet
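Editor's note: a hedged sketch of how the two contrib ops registered in this patch might be composed in the alignment head (the real wiring lives in models/aligndet/builder.py; all variable names and shapes here are illustrative):

    import mxnet as mx

    feat = mx.sym.var("fpn_p3")       # (n, 256, h, w), stride-8 FPN level
    anchor = mx.sym.var("proposal")   # (n, h * w * num_anchors, 4), refined boxes

    # offset: (n, num_anchors * 7 * 7 * 2, h, w)
    offset = mx.sym.contrib.GetAnchorOffset(
        data=feat, anchor=anchor, kernel=(7, 7), stride=8)
    # column features sampled at the anchor-aligned locations:
    # (n, 256 * 7 * 7, h * w)
    col = mx.sym.contrib.DeformableConvolutionIm2Col(
        data=feat, offset=offset, kernel=(7, 7), pad=(3, 3),
        num_filter=256, num_deformable_group=1)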