A detailed walkthrough of FasterRCNNMetaArch:
As noted in the previous post, __init__ is essentially just argument bookkeeping, as shown below:
- __init__()
def __init__(self,
             is_training,
             num_classes,
             image_resizer_fn,
             feature_extractor,
             first_stage_only,
             first_stage_anchor_generator,
             first_stage_atrous_rate,
             first_stage_box_predictor_arg_scope,
             first_stage_box_predictor_kernel_size,
             first_stage_box_predictor_depth,
             first_stage_minibatch_size,
             first_stage_positive_balance_fraction,
             first_stage_nms_score_threshold,
             first_stage_nms_iou_threshold,
             first_stage_max_proposals,
             first_stage_localization_loss_weight,
             first_stage_objectness_loss_weight,
             initial_crop_size,
             maxpool_kernel_size,
             maxpool_stride,
             second_stage_mask_rcnn_box_predictor,
             second_stage_batch_size,
             second_stage_balance_fraction,
             second_stage_non_max_suppression_fn,
             second_stage_score_conversion_fn,
             second_stage_localization_loss_weight,
             second_stage_classification_loss_weight,
             second_stage_classification_loss,
             second_stage_mask_prediction_loss_weight=1.0,
             hard_example_miner=None,
             parallel_iterations=16):
  super(FasterRCNNMetaArch, self).__init__(num_classes=num_classes)
  # Sanity-check the arguments.
  if is_training and second_stage_batch_size > first_stage_max_proposals:
    raise ValueError('second_stage_batch_size should be no greater than '
                     'first_stage_max_proposals.')
  if not isinstance(first_stage_anchor_generator,
                    grid_anchor_generator.GridAnchorGenerator):
    raise ValueError('first_stage_anchor_generator must be of type '
                     'grid_anchor_generator.GridAnchorGenerator.')
  # Store the configuration arguments.
  self._is_training = is_training
  self._image_resizer_fn = image_resizer_fn  # image resizing function
  self._feature_extractor = feature_extractor  # the feature extractor introduced earlier
  self._first_stage_only = first_stage_only  # whether to run only the proposal stage
  # The first class is reserved as background.
  # Class index 0 is the background class.
  unmatched_cls_target = tf.constant(
      [1] + self._num_classes * [0], dtype=tf.float32)
  # target_assigner builds the objects that match groundtruth to anchors/proposals.
  self._proposal_target_assigner = target_assigner.create_target_assigner(
      'FasterRCNN', 'proposal')
  self._detector_target_assigner = target_assigner.create_target_assigner(
      'FasterRCNN', 'detection', unmatched_cls_target=unmatched_cls_target)
  # Both proposal and detector target assigners use the same box coder.
  self._box_coder = self._proposal_target_assigner.box_coder
  # (First stage) Region proposal network parameters.
  # First-stage anchor generator.
  self._first_stage_anchor_generator = first_stage_anchor_generator
  self._first_stage_atrous_rate = first_stage_atrous_rate
  self._first_stage_box_predictor_arg_scope = (
      first_stage_box_predictor_arg_scope)
  self._first_stage_box_predictor_kernel_size = (
      first_stage_box_predictor_kernel_size)
  self._first_stage_box_predictor_depth = first_stage_box_predictor_depth
  self._first_stage_minibatch_size = first_stage_minibatch_size
  # Sampler that balances positive and negative examples in the first stage.
  self._first_stage_sampler = sampler.BalancedPositiveNegativeSampler(
      positive_fraction=first_stage_positive_balance_fraction)
  self._first_stage_box_predictor = box_predictor.ConvolutionalBoxPredictor(
      self._is_training, num_classes=1,
      conv_hyperparams=self._first_stage_box_predictor_arg_scope,
      min_depth=0, max_depth=0, num_layers_before_predictor=0,
      use_dropout=False, dropout_keep_prob=1.0, kernel_size=1,
      box_code_size=self._box_coder.code_size)
  # First-stage NMS score threshold, IoU threshold, and maximum proposal count.
  self._first_stage_nms_score_threshold = first_stage_nms_score_threshold
  self._first_stage_nms_iou_threshold = first_stage_nms_iou_threshold
  self._first_stage_max_proposals = first_stage_max_proposals
  # First-stage losses: WeightedSmoothL1LocalizationLoss and
  # WeightedSoftmaxClassificationLoss.
  self._first_stage_localization_loss = (
      losses.WeightedSmoothL1LocalizationLoss(anchorwise_output=True))
  self._first_stage_objectness_loss = (
      losses.WeightedSoftmaxClassificationLoss(anchorwise_output=True))
  self._first_stage_loc_loss_weight = first_stage_localization_loss_weight
  self._first_stage_obj_loss_weight = first_stage_objectness_loss_weight
  # Per-region cropping parameters (ROI crop size).
  self._initial_crop_size = initial_crop_size
  self._maxpool_kernel_size = maxpool_kernel_size
  self._maxpool_stride = maxpool_stride
  self._mask_rcnn_box_predictor = second_stage_mask_rcnn_box_predictor
  # Second-stage parameters.
  self._second_stage_batch_size = second_stage_batch_size
  self._second_stage_sampler = sampler.BalancedPositiveNegativeSampler(
      positive_fraction=second_stage_balance_fraction)
  # Second-stage NMS and score-conversion functions.
  self._second_stage_nms_fn = second_stage_non_max_suppression_fn
  self._second_stage_score_conversion_fn = second_stage_score_conversion_fn
  # Second-stage losses.
  self._second_stage_localization_loss = (
      losses.WeightedSmoothL1LocalizationLoss(anchorwise_output=True))
  self._second_stage_classification_loss = second_stage_classification_loss
  self._second_stage_mask_loss = (
      losses.WeightedSigmoidClassificationLoss(anchorwise_output=True))
  self._second_stage_loc_loss_weight = second_stage_localization_loss_weight
  self._second_stage_cls_loss_weight = second_stage_classification_loss_weight
  self._second_stage_mask_loss_weight = (
      second_stage_mask_prediction_loss_weight)
  self._hard_example_miner = hard_example_miner
  self._parallel_iterations = parallel_iterations
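As a quick illustration of the background target constructed above (a sketch, not from the source): with num_classes = 3, unmatched_cls_target is a one-hot background row.

import tensorflow as tf

# Sketch: what `unmatched_cls_target` looks like for num_classes = 3.
num_classes = 3
unmatched_cls_target = tf.constant([1] + num_classes * [0], dtype=tf.float32)
# -> [1., 0., 0., 0.]  (class index 0 is the background class)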
- Internal properties of FasterRCNNMetaArch
@property
def first_stage_feature_extractor_scope(self):
  return 'FirstStageFeatureExtractor'

@property
def second_stage_feature_extractor_scope(self):
  return 'SecondStageFeatureExtractor'

@property
def first_stage_box_predictor_scope(self):
  return 'FirstStageBoxPredictor'

@property
def second_stage_box_predictor_scope(self):
  return 'SecondStageBoxPredictor'

@property
def max_num_proposals(self):
  if self._is_training and not self._hard_example_miner:
    return self._second_stage_batch_size
  return self._first_stage_max_proposals
Of these, max_num_proposals is the maximum number of proposal boxes per image in the batch. During training, if no hard example miner is configured it returns second_stage_batch_size; otherwise it returns first_stage_max_proposals. At inference time it always returns first_stage_max_proposals.
- preprocess(self, inputs)
def preprocess(self, inputs):
  if inputs.dtype is not tf.float32:
    raise ValueError('`preprocess` expects a tf.float32 tensor')
  with tf.name_scope('Preprocessor'):
    resized_inputs = tf.map_fn(self._image_resizer_fn,
                               elems=inputs,
                               dtype=tf.float32,
                               parallel_iterations=self._parallel_iterations)
    return self._feature_extractor.preprocess(resized_inputs)
This resizes each image via tf.map_fn and then calls FasterRCNNFeatureExtractor.preprocess() for the extractor-specific preprocessing (e.g. scaling pixel values into [-1, 1]); nothing fancy here. See object_detectionAPI源码阅读笔记(8-faster_rcnn_inception_resnet_v2_feature_extractor.py).
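For intuition, here is a minimal standalone sketch of this pipeline, assuming a fixed 600x600 bilinear resizer and Inception-style [-1, 1] scaling (the real image_resizer_fn and extractor preprocessing come from the model config):

import tensorflow as tf

def _resize_fn(image):
  # Hypothetical stand-in for self._image_resizer_fn: fixed-size bilinear resize.
  return tf.image.resize_images(image, [600, 600])

def preprocess_sketch(inputs):
  # inputs: float32 tensor of shape [batch, height, width, 3].
  resized = tf.map_fn(_resize_fn, elems=inputs, dtype=tf.float32)
  # Extractor-specific step, e.g. Inception-style scaling into [-1, 1].
  return (2.0 / 255.0) * resized - 1.0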
- predict(self, preprocessed_inputs)
def predict(self, preprocessed_inputs):
  (rpn_box_predictor_features, rpn_features_to_crop, anchors_boxlist,
   image_shape) = self._extract_rpn_feature_maps(preprocessed_inputs)
  (rpn_box_encodings, rpn_objectness_predictions_with_background
  ) = self._predict_rpn_proposals(rpn_box_predictor_features)
  # The Faster R-CNN paper recommends pruning anchors that venture outside
  # the image window at training time and clipping at inference time.
  clip_window = tf.to_float(tf.stack([0, 0, image_shape[1], image_shape[2]]))
  if self._is_training:
    (rpn_box_encodings, rpn_objectness_predictions_with_background,
     anchors_boxlist) = self._remove_invalid_anchors_and_predictions(
         rpn_box_encodings, rpn_objectness_predictions_with_background,
         anchors_boxlist, clip_window)
  else:
    anchors_boxlist = box_list_ops.clip_to_window(
        anchors_boxlist, clip_window)
  anchors = anchors_boxlist.get()
  prediction_dict = {
      'rpn_box_predictor_features': rpn_box_predictor_features,
      'rpn_features_to_crop': rpn_features_to_crop,
      'image_shape': image_shape,
      'rpn_box_encodings': rpn_box_encodings,
      'rpn_objectness_predictions_with_background':
          rpn_objectness_predictions_with_background,
      'anchors': anchors
  }
  if not self._first_stage_only:
    prediction_dict.update(self._predict_second_stage(
        rpn_box_encodings,
        rpn_objectness_predictions_with_background,
        rpn_features_to_crop,
        anchors, image_shape))
  return prediction_dict
This function runs the forward pass on preprocessed_inputs and produces the most raw predictions. If first_stage_only is set to True, it outputs only the (un-postprocessed) RPN predictions; otherwise it outputs both the first-stage RPN predictions and the second-stage box classifier predictions.
Other points worth noting:
+ Anchor pruning vs. clipping: as the Faster R-CNN paper recommends, anchors that extend beyond the image boundary are removed at training time, while at inference (prediction) time they are merely clipped to the image window.
+ Proposal padding: the proposals for each image in a batch are padded up to self.max_num_proposals (during training positives are usually scarce, so negatives fill the remainder; e.g. if self.max_num_proposals == 128, positives plus negatives must total exactly 128), so every image contributes the same batch size — see the sketch after this list.
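A minimal sketch of the padding idea, assuming a single image's proposal_boxes tensor (the real code works on BoxLists with static shapes):

import tensorflow as tf

def pad_proposals_sketch(proposal_boxes, max_num_proposals=128):
  # proposal_boxes: [num_proposals, 4]; num_proposals may be smaller than max.
  num = tf.shape(proposal_boxes)[0]
  pad = tf.maximum(max_num_proposals - num, 0)
  # Zero-pad along the proposal dimension so every image contributes exactly
  # max_num_proposals boxes to the second-stage batch.
  return tf.pad(proposal_boxes, [[0, pad], [0, 0]])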
Args:
  preprocessed_inputs: a preprocessed image tensor of shape [batch, height, width, channels].
Returns:
  prediction_dict: a dictionary holding "raw" prediction tensors:
  1) rpn_box_predictor_features: shape = [batch_size, height, width, depth], the feature map used to predict proposal boxes and the corresponding objectness scores (foreground vs. background).
  2) rpn_features_to_crop: shape = [batch_size, height, width, depth], the feature map that proposal regions are cropped from (crop-and-resize maps an arbitrary-sized region to a fixed-size feature).
  3) image_shape: a 1-D tensor representing the input image shape.
  4) rpn_box_encodings: shape = [batch_size, num_anchors, self._box_coder.code_size], the predicted (encoded) proposal box coordinates.
  5) rpn_objectness_predictions_with_background: shape = [batch_size, num_anchors, 2], per-anchor class logits, including the background prediction (at class index 0).
  6) anchors: shape = [num_anchors, 4], the first-stage RPN anchors in absolute coordinates; num_anchors differs between training and inference.
  The following entries are present only when the second stage runs:
  7) refined_box_encodings: shape = [total_num_proposals, num_classes, 4], the refined (encoded) box coordinates, where total_num_proposals = batch_size * self.max_num_proposals.
  8) class_predictions_with_background: shape = [total_num_proposals, num_classes + 1], per-box class predictions, including the background class (at class index 0).
  9) num_proposals: the number of proposals per image, up to self.max_num_proposals.
  10) proposal_boxes: shape = [batch_size, self.max_num_proposals, 4], the decoded proposal boxes in absolute coordinates.
  11) mask_predictions: (optional) shape = [total_num_padded_proposals, num_classes, mask_height, mask_width], instance mask predictions.
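To make the call flow concrete, a hypothetical end-to-end snippet (assuming `model` is an already-built FasterRCNNMetaArch; the placeholder shape is an assumption):

import tensorflow as tf

images = tf.placeholder(tf.float32, shape=[1, None, None, 3])
preprocessed = model.preprocess(images)        # resize + extractor preprocessing
prediction_dict = model.predict(preprocessed)  # raw first/second-stage outputs
# e.g. prediction_dict['rpn_box_encodings'] has shape [batch, num_anchors, code_size]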
- postprocess(self, prediction_dict)
def postprocess(self, prediction_dict):
  with tf.name_scope('FirstStagePostprocessor'):
    image_shape = prediction_dict['image_shape']
    if self._first_stage_only:
      proposal_boxes, proposal_scores, num_proposals = self._postprocess_rpn(
          prediction_dict['rpn_box_encodings'],
          prediction_dict['rpn_objectness_predictions_with_background'],
          prediction_dict['anchors'],
          image_shape)
      return {
          'detection_boxes': proposal_boxes,
          'detection_scores': proposal_scores,
          'num_detections': tf.to_float(num_proposals)
      }
  with tf.name_scope('SecondStagePostprocessor'):
    mask_predictions = prediction_dict.get(box_predictor.MASK_PREDICTIONS)
    detections_dict = self._postprocess_box_classifier(
        prediction_dict['refined_box_encodings'],
        prediction_dict['class_predictions_with_background'],
        prediction_dict['proposal_boxes'],
        prediction_dict['num_proposals'],
        image_shape,
        mask_predictions=mask_predictions)
  return detections_dict
This function converts the raw prediction outputs into final detection results. Detection scores are derived from the logits via the score conversion function; when first_stage_only=True the results come from the first-stage RPN (self.max_num_proposals regions per image), otherwise from the full two-stage pipeline (self._max_detections regions per image), converted into multiclass detections.
Args:
  prediction_dict: a dictionary holding all prediction tensors. When first_stage_only=True it contains rpn_box_encodings, rpn_objectness_predictions_with_background, rpn_features_to_crop, image_shape, and anchors; otherwise it additionally contains refined_box_encodings, class_predictions_with_background, num_proposals, proposal_boxes, and, optionally, mask_predictions.
Returns:
  detections: a dictionary containing the following fields
    detection_boxes: [batch, max_detections, 4], the detected box coordinates
    detection_scores: [batch, max_detections], the detection scores
    detection_classes: [batch, max_detections], the detected box classes (only created when rpn_mode=False)
    num_detections: [batch]
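Continuing the hypothetical snippet from the predict section above:

detections = model.postprocess(prediction_dict)
boxes = detections['detection_boxes']    # [batch, max_detections, 4]
scores = detections['detection_scores']  # [batch, max_detections]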
- loss(self, prediction_dict, scope=None)
def loss(self, prediction_dict, scope=None):
  with tf.name_scope(scope, 'Loss', prediction_dict.values()):
    (groundtruth_boxlists, groundtruth_classes_with_background_list,
     groundtruth_masks_list
    ) = self._format_groundtruth_data(prediction_dict['image_shape'])
    loss_dict = self._loss_rpn(
        prediction_dict['rpn_box_encodings'],
        prediction_dict['rpn_objectness_predictions_with_background'],
        prediction_dict['anchors'],
        groundtruth_boxlists,
        groundtruth_classes_with_background_list)
    if not self._first_stage_only:
      loss_dict.update(
          self._loss_box_classifier(
              prediction_dict['refined_box_encodings'],
              prediction_dict['class_predictions_with_background'],
              prediction_dict['proposal_boxes'],
              prediction_dict['num_proposals'],
              groundtruth_boxlists,
              groundtruth_classes_with_background_list,
              prediction_dict['image_shape'],
              prediction_dict.get('mask_predictions'),
              groundtruth_masks_list,
          ))
  return loss_dict
When first_stage_only=True, only the RPN losses (rpn_localization_loss and rpn_objectness_loss) are computed; otherwise all losses are computed.
Args:
  prediction_dict: a dictionary holding all prediction tensors. When first_stage_only=True it contains rpn_box_encodings, rpn_objectness_predictions_with_background, rpn_features_to_crop, image_shape, and anchors; otherwise it additionally contains refined_box_encodings, class_predictions_with_background, num_proposals, proposal_boxes, and, optionally, mask_predictions.
  scope: an optional name scope.
Returns:
  a dictionary mapping loss names to scalar loss tensors: first_stage_localization_loss, first_stage_objectness_loss, second_stage_localization_loss, and second_stage_classification_loss.
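A common training-time pattern (a sketch, not code from this file) is to sum the returned loss terms into a single scalar for the optimizer:

import tensorflow as tf

loss_dict = model.loss(prediction_dict)
total_loss = tf.add_n(list(loss_dict.values()))  # RPN + box classifier losses
train_op = tf.train.MomentumOptimizer(
    learning_rate=1e-3, momentum=0.9).minimize(total_loss)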
- restore_map(self, from_detection_checkpoint=True)
def restore_map(self, from_detection_checkpoint=True):
  if not from_detection_checkpoint:
    return self._feature_extractor.restore_from_classification_checkpoint_fn(
        self.first_stage_feature_extractor_scope,
        self.second_stage_feature_extractor_scope)
  variables_to_restore = tf.global_variables()
  variables_to_restore.append(slim.get_or_create_global_step())
  # Only load feature extractor variables to be consistent with loading from
  # a classification checkpoint.
  feature_extractor_variables = tf.contrib.framework.filter_variables(
      variables_to_restore,
      include_patterns=[self.first_stage_feature_extractor_scope,
                        self.second_stage_feature_extractor_scope])
  return {var.op.name: var for var in feature_extractor_variables}
Returns a map of variables for restoring parameters from an external checkpoint.
Args:
  from_detection_checkpoint: whether to load from a full detection-model checkpoint or from a classification-model checkpoint used for pre-trained initialization.
Returns:
  a dictionary mapping variable names to the variables to be restored from the checkpoint.
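A hypothetical restore flow built on top of this map (checkpoint_path is assumed to exist):

import tensorflow as tf

var_map = model.restore_map(from_detection_checkpoint=True)
saver = tf.train.Saver(var_map)  # Saver accepts a {checkpoint_name: variable} dict
with tf.Session() as sess:
  saver.restore(sess, checkpoint_path)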