| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| |
| import mxnet as mx |
| from . import proposal_target |
| |
| |
def get_vgg_feature(data):
    """Build the VGG convolutional trunk (conv1_1 through relu5_3) on ``data``.

    Five groups of 3x3 convolution + ReLU pairs are stacked. Groups 1-4 are
    each followed by a 2x2, stride-2 max pooling layer; group 5 is not.

    Parameters
    ----------
    data : symbol fed into ``conv1_1``.

    Returns
    -------
    The ``relu5_3`` activation symbol.
    """

    def conv_relu(bottom, filters, conv_name, relu_name):
        # 3x3 convolution (pad 1) followed by a ReLU activation
        conv = mx.symbol.Convolution(
            data=bottom, kernel=(3, 3), pad=(1, 1), num_filter=filters,
            workspace=2048, name=conv_name)
        return mx.symbol.Activation(data=conv, act_type="relu", name=relu_name)

    def max_pool(bottom, pool_name):
        # 2x2 max pooling with stride 2
        return mx.symbol.Pooling(
            data=bottom, pool_type="max", kernel=(2, 2), stride=(2, 2), name=pool_name)

    # group 1
    net = conv_relu(data, 64, "conv1_1", "relu1_1")
    net = conv_relu(net, 64, "conv1_2", "relu1_2")
    net = max_pool(net, "pool1")
    # group 2
    net = conv_relu(net, 128, "conv2_1", "relu2_1")
    net = conv_relu(net, 128, "conv2_2", "relu2_2")
    net = max_pool(net, "pool2")
    # group 3
    net = conv_relu(net, 256, "conv3_1", "relu3_1")
    net = conv_relu(net, 256, "conv3_2", "relu3_2")
    net = conv_relu(net, 256, "conv3_3", "relu3_3")
    net = max_pool(net, "pool3")
    # group 4
    net = conv_relu(net, 512, "conv4_1", "relu4_1")
    net = conv_relu(net, 512, "conv4_2", "relu4_2")
    net = conv_relu(net, 512, "conv4_3", "relu4_3")
    net = max_pool(net, "pool4")
    # group 5 (no pooling after the last group)
    net = conv_relu(net, 512, "conv5_1", "relu5_1")
    net = conv_relu(net, 512, "conv5_2", "relu5_2")
    net = conv_relu(net, 512, "conv5_3", "relu5_3")

    return net
| |
| |
def get_vgg_top_feature(data):
    """Build the two fully connected VGG stages (fc6/fc7) on top of ``data``.

    The input is flattened, then passed twice through
    FullyConnected(4096) -> ReLU -> Dropout(p=0.5).

    Parameters
    ----------
    data : symbol to flatten and feed into ``fc6``.

    Returns
    -------
    The ``drop7`` dropout symbol.
    """
    net = mx.symbol.Flatten(data=data, name="flatten")
    # group 6 and group 7 share the same layout; only the layer names differ
    stages = (
        ("fc6", "relu6", "drop6"),
        ("fc7", "relu7", "drop7"),
    )
    for fc_name, relu_name, drop_name in stages:
        net = mx.symbol.FullyConnected(data=net, num_hidden=4096, name=fc_name)
        net = mx.symbol.Activation(data=net, act_type="relu", name=relu_name)
        net = mx.symbol.Dropout(data=net, p=0.5, name=drop_name)
    return net
| |
| |
| |
def get_vgg_train(anchor_scales, anchor_ratios, rpn_feature_stride,
                  rpn_pre_topk, rpn_post_topk, rpn_nms_thresh, rpn_min_size, rpn_batch_rois,
                  num_classes, rcnn_feature_stride, rcnn_pooled_size, rcnn_batch_size,
                  rcnn_batch_rois, rcnn_fg_fraction, rcnn_fg_overlap, rcnn_bbox_stds):
    """Assemble the training symbol: VGG trunk, RPN head with losses, RCNN head with losses.

    Input variables created here are named ``data``, ``im_info``, ``gt_boxes``,
    ``label``, ``bbox_target`` and ``bbox_weight`` (the last three feed the RPN losses).

    Parameters
    ----------
    anchor_scales, anchor_ratios : forwarded to MultiProposal as ``scales`` /
        ``ratios``; the product of their lengths is the number of anchors used
        to size the RPN score (2x) and bbox (4x) convolutions.
    rpn_feature_stride : MultiProposal ``feature_stride``.
    rpn_pre_topk, rpn_post_topk : MultiProposal ``rpn_pre_nms_top_n`` /
        ``rpn_post_nms_top_n``.
    rpn_nms_thresh : MultiProposal ``threshold``.
    rpn_min_size : MultiProposal ``rpn_min_size``.
    rpn_batch_rois : the RPN bbox loss gradient is scaled by ``1.0 / rpn_batch_rois``.
    num_classes : output width of ``cls_score`` (and 4x that for ``bbox_pred``);
        also passed to the ``proposal_target`` custom op.
    rcnn_feature_stride : ROIPooling uses ``spatial_scale = 1.0 / rcnn_feature_stride``.
    rcnn_pooled_size : ROIPooling ``pooled_size``.
    rcnn_batch_size : passed to ``proposal_target`` as ``batch_images`` and used
        as the leading dimension when reshaping the outputs.
    rcnn_batch_rois : passed to ``proposal_target`` as ``batch_rois``; the RCNN
        bbox loss gradient is scaled by ``1.0 / rcnn_batch_rois``.
    rcnn_fg_fraction, rcnn_fg_overlap, rcnn_bbox_stds : passed to
        ``proposal_target`` as ``fg_fraction``, ``fg_overlap`` and ``box_stds``.

    Returns
    -------
    A grouped symbol with, in order: ``rpn_cls_prob``, ``rpn_bbox_loss``, the
    reshaped ``cls_prob``, the reshaped ``bbox_loss`` and the gradient-blocked
    reshaped RCNN label.
    """
    sym = mx.symbol
    anchors_per_position = len(anchor_scales) * len(anchor_ratios)

    # network inputs
    image = sym.Variable(name="data")
    image_info = sym.Variable(name="im_info")
    truth_boxes = sym.Variable(name="gt_boxes")
    anchor_label = sym.Variable(name='label')
    anchor_bbox_target = sym.Variable(name='bbox_target')
    anchor_bbox_weight = sym.Variable(name='bbox_weight')

    # shared convolutional layers
    shared_feat = get_vgg_feature(image)

    # RPN layers
    rpn_hidden = sym.Convolution(
        data=shared_feat, kernel=(3, 3), pad=(1, 1), num_filter=512, name="rpn_conv_3x3")
    rpn_hidden = sym.Activation(data=rpn_hidden, act_type="relu", name="rpn_relu")

    # rpn classification: scores are reshaped so the softmax runs over an axis of size 2
    objectness = sym.Convolution(
        data=rpn_hidden, kernel=(1, 1), pad=(0, 0),
        num_filter=2 * anchors_per_position, name="rpn_cls_score")
    objectness_pairs = sym.Reshape(
        data=objectness, shape=(0, 2, -1, 0), name="rpn_cls_score_reshape")
    rpn_cls_output = sym.SoftmaxOutput(
        data=objectness_pairs, label=anchor_label, multi_output=True,
        normalization='valid', use_ignore=True, ignore_label=-1, name="rpn_cls_prob")
    objectness_prob = sym.softmax(data=objectness_pairs, axis=1, name="rpn_cls_act")
    objectness_prob = sym.Reshape(
        data=objectness_prob, shape=(0, 2 * anchors_per_position, -1, 0),
        name='rpn_cls_act_reshape')

    # rpn bbox regression
    anchor_deltas = sym.Convolution(
        data=rpn_hidden, kernel=(1, 1), pad=(0, 0),
        num_filter=4 * anchors_per_position, name="rpn_bbox_pred")
    rpn_residual = anchor_deltas - anchor_bbox_target
    rpn_smooth = sym.smooth_l1(name='rpn_bbox_loss_', scalar=3.0, data=rpn_residual)
    rpn_reg_output = sym.MakeLoss(
        name='rpn_bbox_loss', data=anchor_bbox_weight * rpn_smooth,
        grad_scale=1.0 / rpn_batch_rois)

    # rpn proposal
    proposals = sym.contrib.MultiProposal(
        cls_prob=objectness_prob, bbox_pred=anchor_deltas, im_info=image_info, name='rois',
        feature_stride=rpn_feature_stride, scales=anchor_scales, ratios=anchor_ratios,
        rpn_pre_nms_top_n=rpn_pre_topk, rpn_post_nms_top_n=rpn_post_topk,
        threshold=rpn_nms_thresh, rpn_min_size=rpn_min_size)

    # rcnn roi proposal target
    # NOTE(review): the 'proposal_target' op is presumably registered by the
    # module-level `proposal_target` import - confirm.
    sampled = sym.Custom(
        rois=proposals, gt_boxes=truth_boxes, op_type='proposal_target',
        num_classes=num_classes, batch_images=rcnn_batch_size,
        batch_rois=rcnn_batch_rois, fg_fraction=rcnn_fg_fraction,
        fg_overlap=rcnn_fg_overlap, box_stds=rcnn_bbox_stds)
    sampled_rois, roi_label, roi_bbox_target, roi_bbox_weight = (
        sampled[0], sampled[1], sampled[2], sampled[3])

    # rcnn roi pool
    pooled = sym.ROIPooling(
        name='roi_pool', data=shared_feat, rois=sampled_rois,
        pooled_size=rcnn_pooled_size, spatial_scale=1.0 / rcnn_feature_stride)

    # rcnn top feature
    head_feat = get_vgg_top_feature(pooled)

    # rcnn classification
    class_scores = sym.FullyConnected(name='cls_score', data=head_feat, num_hidden=num_classes)
    class_output = sym.SoftmaxOutput(
        name='cls_prob', data=class_scores, label=roi_label, normalization='batch')

    # rcnn bbox regression
    roi_deltas = sym.FullyConnected(name='bbox_pred', data=head_feat, num_hidden=num_classes * 4)
    rcnn_residual = roi_deltas - roi_bbox_target
    rcnn_smooth = sym.smooth_l1(name='bbox_loss_', scalar=1.0, data=rcnn_residual)
    rcnn_reg_output = sym.MakeLoss(
        name='bbox_loss', data=roi_bbox_weight * rcnn_smooth,
        grad_scale=1.0 / rcnn_batch_rois)

    # reshape output so the leading dimension is rcnn_batch_size
    roi_label = sym.Reshape(data=roi_label, shape=(rcnn_batch_size, -1), name='label_reshape')
    class_output = sym.Reshape(
        data=class_output, shape=(rcnn_batch_size, -1, num_classes), name='cls_prob_reshape')
    rcnn_reg_output = sym.Reshape(
        data=rcnn_reg_output, shape=(rcnn_batch_size, -1, 4 * num_classes),
        name='bbox_loss_reshape')

    # group output; the label is exposed as an output but carries no gradient
    outputs = [rpn_cls_output, rpn_reg_output, class_output, rcnn_reg_output,
               sym.BlockGrad(roi_label)]
    return sym.Group(outputs)
| |
| |
def get_vgg_test(anchor_scales, anchor_ratios, rpn_feature_stride,
                 rpn_pre_topk, rpn_post_topk, rpn_nms_thresh, rpn_min_size,
                 num_classes, rcnn_feature_stride, rcnn_pooled_size, rcnn_batch_size):
    """Assemble the inference symbol: VGG trunk, RPN proposals and RCNN head.

    Input variables created here are named ``data`` and ``im_info``.

    Parameters
    ----------
    anchor_scales, anchor_ratios : forwarded to MultiProposal as ``scales`` /
        ``ratios``; the product of their lengths is the number of anchors used
        to size the RPN score (2x) and bbox (4x) convolutions.
    rpn_feature_stride : MultiProposal ``feature_stride``.
    rpn_pre_topk, rpn_post_topk : MultiProposal ``rpn_pre_nms_top_n`` /
        ``rpn_post_nms_top_n``.
    rpn_nms_thresh : MultiProposal ``threshold``.
    rpn_min_size : MultiProposal ``rpn_min_size``.
    num_classes : output width of ``cls_score`` (and 4x that for ``bbox_pred``).
    rcnn_feature_stride : ROIPooling uses ``spatial_scale = 1.0 / rcnn_feature_stride``.
    rcnn_pooled_size : ROIPooling ``pooled_size``.
    rcnn_batch_size : leading dimension used when reshaping the outputs.

    Returns
    -------
    A grouped symbol with, in order: the proposal ``rois``, the reshaped
    ``cls_prob`` and the reshaped ``bbox_pred``.
    """
    sym = mx.symbol
    anchors_per_position = len(anchor_scales) * len(anchor_ratios)

    # network inputs
    image = sym.Variable(name="data")
    image_info = sym.Variable(name="im_info")

    # shared convolutional layers
    shared_feat = get_vgg_feature(image)

    # rpn feature
    rpn_hidden = sym.Convolution(
        data=shared_feat, kernel=(3, 3), pad=(1, 1), num_filter=512, name="rpn_conv_3x3")
    rpn_hidden = sym.Activation(data=rpn_hidden, act_type="relu", name="rpn_relu")

    # rpn classification: scores are reshaped so the softmax runs over an axis of size 2
    objectness = sym.Convolution(
        data=rpn_hidden, kernel=(1, 1), pad=(0, 0),
        num_filter=2 * anchors_per_position, name="rpn_cls_score")
    objectness_pairs = sym.Reshape(
        data=objectness, shape=(0, 2, -1, 0), name="rpn_cls_score_reshape")
    objectness_prob = sym.softmax(data=objectness_pairs, axis=1, name="rpn_cls_act")
    objectness_prob = sym.Reshape(
        data=objectness_prob, shape=(0, 2 * anchors_per_position, -1, 0),
        name='rpn_cls_act_reshape')

    # rpn bbox regression
    anchor_deltas = sym.Convolution(
        data=rpn_hidden, kernel=(1, 1), pad=(0, 0),
        num_filter=4 * anchors_per_position, name="rpn_bbox_pred")

    # rpn proposal
    proposals = sym.contrib.MultiProposal(
        cls_prob=objectness_prob, bbox_pred=anchor_deltas, im_info=image_info, name='rois',
        feature_stride=rpn_feature_stride, scales=anchor_scales, ratios=anchor_ratios,
        rpn_pre_nms_top_n=rpn_pre_topk, rpn_post_nms_top_n=rpn_post_topk,
        threshold=rpn_nms_thresh, rpn_min_size=rpn_min_size)

    # rcnn roi pool
    pooled = sym.ROIPooling(
        name='roi_pool', data=shared_feat, rois=proposals,
        pooled_size=rcnn_pooled_size, spatial_scale=1.0 / rcnn_feature_stride)

    # rcnn top feature
    head_feat = get_vgg_top_feature(pooled)

    # rcnn classification
    class_scores = sym.FullyConnected(name='cls_score', data=head_feat, num_hidden=num_classes)
    class_prob = sym.softmax(name='cls_prob', data=class_scores)

    # rcnn bbox regression
    roi_deltas = sym.FullyConnected(name='bbox_pred', data=head_feat, num_hidden=num_classes * 4)

    # reshape output so the leading dimension is rcnn_batch_size
    class_prob = sym.Reshape(
        data=class_prob, shape=(rcnn_batch_size, -1, num_classes), name='cls_prob_reshape')
    roi_deltas = sym.Reshape(
        data=roi_deltas, shape=(rcnn_batch_size, -1, 4 * num_classes),
        name='bbox_pred_reshape')

    # group output
    return sym.Group([proposals, class_prob, roi_deltas])