Author: 王拓 | Published 2019-02-16 16:37

Learning Detection

yolo

darkflow usage guide

https://github.com/thtrieu/darkflow.git

  • Load the model
    Initialize from the downloaded weights. Make sure the cfg file and the weights file match, otherwise an error is raised.
flow --model cfg/tiny-yolo-voc.cfg --load bin/tiny-yolo-voc.weights
  • Test the model
flow --imgdir ~/dataset/yolo_test/ --model cfg/tiny-yolo-voc.cfg --load bin/tiny-yolo-voc.weights  --gpu 1.0
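Besides the flow command line, darkflow also exposes a Python API; the following mirrors the example in the darkflow README (the image path is a placeholder):

import cv2
from darkflow.net.build import TFNet

options = {"model": "cfg/tiny-yolo-voc.cfg",
           "load": "bin/tiny-yolo-voc.weights",
           "threshold": 0.25}
tfnet = TFNet(options)

imgcv = cv2.imread("./sample_img/sample_dog.jpg")   # placeholder image path
result = tfnet.return_predict(imgcv)
# result is a list of dicts with keys: label, confidence, topleft, bottomright
print(result)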

Reading the yolo source code

https://github.com/hizhangp/yolo_tensorflow

Loading the Pascal VOC annotations

The load_pascal_annotation function in utils/pascal_voc.py
loads the annotation information of the VOC dataset. The VOC annotations are stored as XML files, whose format is a tree structure.
The root element:

<annotation>
  <folder>VOC2012</folder>
  <filename>2007_000027.jpg</filename>
  <source></source>  
  <size></size>
  <segmented>0</segmented>
  <object></object>
</annotation>

The object element:

<object>
  <name>person</name>
  <pose>Frontal</pose>
  <truncated>1</truncated>
  <difficult>0</difficult>
  <bndbox>
    <xmin>72</xmin>
    <ymin>209</ymin>  
    <xmax>111</xmax>
    <ymax>259</ymax>
  </bndbox>
</object>

The parsing function, def load_pascal_annotation(self, index):

imname = os.path.join(self.data_path, 'JPEGImages', index + '.jpg')
im = cv2.imread(imname)
h_ratio = 1.0 * self.image_size / im.shape[0] # self.image_size is 448
w_ratio = 1.0 * self.image_size / im.shape[1]

label = np.zeros((self.cell_size, self.cell_size, 25)) # 25 = 1 + 4 + 20
# 1 for the confidence (is there an object), 4 for the box, 20 for the per-class scores
# only 20 classes can be predicted; this is the B = 1 case

import xml.etree.ElementTree as ET # parser for the XML annotation files
filename = os.path.join(self.data_path, 'Annotations', index + '.xml') # path of the annotation XML file
tree = ET.parse(filename)
objs = tree.findall('object') # 'object' is one branch of the annotation file

for obj in objs:
  bbox = obj.find('bndbox')
  # Make pixel indexes 0-based and clip the box to self.image_size (448 x 448)
  x1 = max(min((float(bbox.find('xmin').text) - 1) * w_ratio, self.image_size - 1), 0)
  y1 = max(min((float(bbox.find('ymin').text) - 1) * h_ratio, self.image_size - 1), 0)
  x2 = max(min((float(bbox.find('xmax').text) - 1) * w_ratio, self.image_size - 1), 0)
  y2 = max(min((float(bbox.find('ymax').text) - 1) * h_ratio, self.image_size - 1), 0)
  cls_ind = self.class_to_ind[obj.find('name').text.lower().strip()]
  # the 'name' branch of the object element records its class; look up the class index
  boxes = [(x2 + x1) / 2.0, (y2 + y1) / 2.0, x2 - x1, y2 - y1]  # (x_center, y_center, w, h)
  x_ind = int(boxes[0] * self.cell_size / self.image_size)
  y_ind = int(boxes[1] * self.cell_size / self.image_size)
  if label[y_ind, x_ind, 0] == 1:
    continue  # since B = 1, if this cell already contains an object it does not predict another one;
    # otherwise one would compare the object's aspect ratio with the preset anchors and pick the closest
  label[y_ind, x_ind, 0] = 1            # set confidence to 1
  label[y_ind, x_ind, 1:5] = boxes
  label[y_ind, x_ind, 5 + cls_ind] = 1
return label, len(objs)  # return the label and the number of objects (author's question: should the skipped, duplicate object still be counted?)
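To make the cell assignment concrete, here is a small standalone sketch with made-up numbers (the box coordinates and class index below are hypothetical):

import numpy as np

cell_size, image_size, num_classes = 7, 448, 20

# hypothetical ground-truth box, already rescaled to the 448x448 image, as (x1, y1, x2, y2) plus class index
x1, y1, x2, y2, cls_ind = 100.0, 150.0, 300.0, 350.0, 11

boxes = [(x2 + x1) / 2.0, (y2 + y1) / 2.0, x2 - x1, y2 - y1]   # (x_center, y_center, w, h) = (200, 250, 200, 200)
x_ind = int(boxes[0] * cell_size / image_size)   # 200 * 7 / 448 = 3.125 -> column 3
y_ind = int(boxes[1] * cell_size / image_size)   # 250 * 7 / 448 = 3.906 -> row 3

label = np.zeros((cell_size, cell_size, 5 + num_classes))
label[y_ind, x_ind, 0] = 1            # confidence: this cell contains an object
label[y_ind, x_ind, 1:5] = boxes      # box center and size in image pixels
label[y_ind, x_ind, 5 + cls_ind] = 1  # one-hot class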

Computing the loss

The loss_layer function in yolo/yolo_net.py.
When loading the label only one bbox per cell is kept, but the network weights predict two boxes per cell.

def loss_layer(self, predicts, label, scope='loss_layer'):
  with tf.variable_scope(scope):
    # built in __init__(self, is_training=True):
    # self.offset = np.transpose(
    #     np.reshape(
    #         np.array([np.arange(self.cell_size)] * self.cell_size * self.boxes_per_cell),
    #         (self.boxes_per_cell, self.cell_size, self.cell_size)),
    #     (1, 2, 0))
    ...
    offset = tf.reshape(tf.constant(self.offset, dtype=tf.float32),
                        [1, self.cell_size, self.cell_size, self.boxes_per_cell])
    offset = tf.tile(offset, [self.batch_size, 1, 1, 1])  # repeat over the batch; offset for the x coordinate
    offset_tran = tf.transpose(offset, (0, 2, 1, 3))      # offset for the y coordinate
    # Convert to coordinates relative to the image origin: the network output is relative to each
    # cell, but the IOU has to be computed in image coordinates. Concretely, an x of 0.9 relative
    # to the image origin always means the box center lies in the last cell column, whereas relative
    # to a cell the resulting position depends on where that cell is.
    predict_boxes_tran = tf.stack(
        [(predict_boxes[..., 0] + offset) / self.cell_size,
         (predict_boxes[..., 1] + offset_tran) / self.cell_size,
         tf.square(predict_boxes[..., 2]),  # square the predicted w/h when computing the IOU!
         tf.square(predict_boxes[..., 3])], axis=-1)
    # the IOU is computed with coordinates relative to the image origin
    iou_predict_truth = self.calc_iou(predict_boxes_tran, boxes)

    object_mask = tf.reduce_max(iou_predict_truth, 3, keep_dims=True)  # larger IOU of the two predicted boxes
    object_mask = tf.cast((iou_predict_truth >= object_mask), tf.float32) * response
    # response marks whether the cell contains an object; object_mask marks whether a given box of a
    # given cell is responsible for the prediction. The comparison iou_predict_truth >= object_mask
    # picks out the matching box: the responsible box is set to 1, the other to 0.
    # The resulting tensor has shape (bs, 7, 7, 2), with a 1 at the responsible box of the predicting cell.
    noobject_mask = tf.ones_like(object_mask, dtype=tf.float32) - object_mask  # complement of object_mask
    # Convert to coordinates relative to the cell: the label is read in relative to the image origin,
    # but the coordinate loss is computed in cell-relative coordinates.
    boxes_tran = tf.stack(
        [boxes[..., 0] * self.cell_size - offset,
         boxes[..., 1] * self.cell_size - offset_tran,
         tf.sqrt(boxes[..., 2]),  # take the square root when computing the loss!
         tf.sqrt(boxes[..., 3])], axis=-1)

    # class_loss
    class_delta = response * (predict_classes - classes)
    class_loss = tf.reduce_mean(tf.reduce_sum(tf.square(class_delta), axis=[1, 2, 3]),
                                name='class_loss') * self.class_scale  # 2.0
    # object_loss -- confidence loss; the target for the predicted confidence (predict_scales) is the IOU
    object_delta = object_mask * (predict_scales - iou_predict_truth)
    object_loss = tf.reduce_mean(tf.reduce_sum(tf.square(object_delta), axis=[1, 2, 3]),
                                 name='object_loss') * self.object_scale  # 1.0
    # noobject_loss
    noobject_delta = noobject_mask * predict_scales
    noobject_loss = tf.reduce_mean(tf.reduce_sum(tf.square(noobject_delta), axis=[1, 2, 3]),
                                   name='noobject_loss') * self.noobject_scale  # 1.0
    # coord_loss
    coord_mask = tf.expand_dims(object_mask, 4)               # (bs, 7, 7, 2) to (bs, 7, 7, 2, 1)
    boxes_delta = coord_mask * (predict_boxes - boxes_tran)   # (bs, 7, 7, 2, 4)
    coord_loss = tf.reduce_mean(tf.reduce_sum(tf.square(boxes_delta), axis=[1, 2, 3, 4]),
                                name='coord_loss') * self.coord_scale  # 5.0
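To see the offset trick with concrete numbers, here is a small numpy sketch using the default cell_size = 7 and boxes_per_cell = 2; it rebuilds the offset grid the same way as the commented __init__ snippet above and shows how a cell-relative x of 0.9 maps to different image-relative positions depending on the column of the cell:

import numpy as np

cell_size, boxes_per_cell = 7, 2

offset = np.transpose(
    np.reshape(
        np.array([np.arange(cell_size)] * cell_size * boxes_per_cell),
        (boxes_per_cell, cell_size, cell_size)),
    (1, 2, 0))                                  # shape (7, 7, 2), offset[y, x, b] == x

# a predicted x of 0.9 is relative to its own cell
print((0.9 + offset[3, 6, 0]) / cell_size)      # (0.9 + 6) / 7 ~ 0.99: near the right edge of the image
print((0.9 + offset[3, 0, 0]) / cell_size)      # (0.9 + 0) / 7 ~ 0.13: still near the left edge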

Computing the IOU

The calc_iou function in yolo/yolo_net.py

"""calculate ious
Args:
  boxes1: 5-D tensor [BATCH_SIZE, CELL_SIZE, CELL_SIZE, BOXES_PER_CELL, 4]  ====> (x_center, y_center, w, h)
  boxes2: 5-D tensor [BATCH_SIZE, CELL_SIZE, CELL_SIZE, BOXES_PER_CELL, 4] ===> (x_center, y_center, w, h)
Return:
  iou: 4-D tensor [BATCH_SIZE, CELL_SIZE, CELL_SIZE, BOXES_PER_CELL]
"""
def calc_iou(self, predicts_boxes, boxes, scope='iou'):
  with tf.variable_scope(scope):
    # convert (x_center, y_center, w, h) to (x1, y1, x2, y2)
    predicts_boxes_t = tf.stack(
        [predicts_boxes[..., 0] - predicts_boxes[..., 2] / 2.0,   # x1 = x_center - w/2
         predicts_boxes[..., 1] - predicts_boxes[..., 3] / 2.0,   # y1 = y_center - h/2
         predicts_boxes[..., 0] + predicts_boxes[..., 2] / 2.0,   # x2 = x_center + w/2
         predicts_boxes[..., 1] + predicts_boxes[..., 3] / 2.0],  # y2 = y_center + h/2
        axis=-1)
    boxes_t = tf.stack(
        [boxes[..., 0] - boxes[..., 2] / 2.0,   # x1 = x_center - w/2
         boxes[..., 1] - boxes[..., 3] / 2.0,   # y1 = y_center - h/2
         boxes[..., 0] + boxes[..., 2] / 2.0,   # x2 = x_center + w/2
         boxes[..., 1] + boxes[..., 3] / 2.0],  # y2 = y_center + h/2
        axis=-1)
    # calculate the left-upper and right-bottom corners of the intersection;
    # boxes is the label, whose two boxes are identical (tf.tile)
    lu = tf.maximum(predicts_boxes_t[..., :2], boxes_t[..., :2])  # maximum of (x1, y1), shape (bs, 7, 7, 2, 2)
    rb = tf.minimum(predicts_boxes_t[..., 2:], boxes_t[..., 2:])  # minimum of (x2, y2), shape (bs, 7, 7, 2, 2)
    # intersection
    intersection = tf.maximum(0.0, rb - lu)                     # width and height of the intersection, (bs, 7, 7, 2, 2)
    inter_square = intersection[..., 0] * intersection[..., 1]  # width * height, (bs, 7, 7, 2)
    # calculate union_square
    union_square = predicts_boxes[..., 2] * predicts_boxes[..., 3] + boxes[..., 2] * boxes[..., 3] - inter_square
    union_square = tf.maximum(union_square, 1e-10)
    return tf.clip_by_value(inter_square / union_square, 0.0, 1.0)
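The same computation in plain numpy, as a quick sanity check of the formula on two hypothetical boxes:

import numpy as np

def iou_xywh(box1, box2):
    # convert (x_center, y_center, w, h) to corner coordinates
    b1 = np.array([box1[0] - box1[2] / 2, box1[1] - box1[3] / 2,
                   box1[0] + box1[2] / 2, box1[1] + box1[3] / 2])
    b2 = np.array([box2[0] - box2[2] / 2, box2[1] - box2[3] / 2,
                   box2[0] + box2[2] / 2, box2[1] + box2[3] / 2])
    lu = np.maximum(b1[:2], b2[:2])   # top-left corner of the intersection
    rb = np.minimum(b1[2:], b2[2:])   # bottom-right corner of the intersection
    wh = np.maximum(0.0, rb - lu)     # zero when the boxes do not overlap
    inter = wh[0] * wh[1]
    union = box1[2] * box1[3] + box2[2] * box2[3] - inter
    return inter / max(union, 1e-10)

# intersection 0.3 * 0.3 = 0.09, union 0.16 + 0.16 - 0.09 = 0.23, IOU ~ 0.39
print(iou_xywh((0.5, 0.5, 0.4, 0.4), (0.6, 0.6, 0.4, 0.4)))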

Interpreting the network output

The interpret_output function in test.py

def interpret_output(self, output):
  probs = np.zeros((self.cell_size, self.cell_size, self.boxes_per_cell, self.num_class)) # container for prob of each cell, box, class
  class_probs = np.reshape(output[:self.boundary1], 
      (self.cell_size, self.cell_size, self.num_class))
  scales = np.reshape(output[self.boundary1:self.boundary2],
      (self.cell_size, self.cell_size, self.boxes_per_cell))
  boxes = np.reshape(output[self.boundary2:], 
      (self.cell_size, self.cell_size, self.boxes_per_cell, 4))

  offset = np.array([np.arange(self.cell_size)] * self.cell_size * self.boxes_per_cell)
  offset = np.transpose(np.reshape(offset, [self.boxes_per_cell, self.cell_size, self.cell_size]), (1, 2, 0))

  # convert to image-origin-based coordinates
  boxes[:, :, :, 0] += offset
  boxes[:, :, :, 1] += np.transpose(offset, (1, 0, 2))
  boxes[:,:,:,:2] = 1.0*boxes[:,:,:,0:2] / self.cell_size
  boxes[:,:,:,2:] = np.square(boxes[:,:,:,2:])
  
  boxes *= self.image_size # 448
  
  for i in range(self.boxes_per_cell):
    for j in range(self.num_class):
      probs[:,:,i,j] = class_probs[:,:,j]*scales[:,:,i]
  
  filter_mat_probs = np.array(probs >= self.threshold, dtype='bool') # per (cell, box, class): is the prob big enough? shape (7,7,2,20)
  probs_filtered = probs[filter_mat_probs] # (7,7,2,20) -> (n,) probs above the threshold

  filter_mat_boxes = np.nonzero(filter_mat_probs) # tuple of 4 index arrays (cell_x, cell_y, box_ind, class_ind)
  boxes_filtered = boxes[filter_mat_boxes[0], filter_mat_boxes[1], filter_mat_boxes[2]] # (7,7,2,4) -> (n,4)

  classes_num_filtered = np.argmax(probs, axis=3)[
      filter_mat_boxes[0], filter_mat_boxes[1], filter_mat_boxes[2]] # (7,7,2) -> (n,) class index for each kept box
  # note: the argmax is over probs rather than filter_mat_probs; if several class probs of the same box
  # exceed the threshold, only the class with the largest prob is kept, and exact ties can still misbehave
  # Non maximum suppression
  for i in range(len(boxes_filtered)):
    if probs_filtered[i] == 0:
      continue
    for j in range(i + 1, len(boxes_filtered)):
      if self.iou(boxes_filtered[i], boxes_filtered[j]) > self.iou_threshold: # 0.5
         probs_filtered[j] = 0.0 # nms, set low confidence prob to 0, because of overlap

  filter_iou = np.array(probs_filtered > 0.0, dtype='bool') # index 
  boxes_filtered = boxes_filtered[filter_iou]
  probs_filtered = probs_filtered[filter_iou]
  classes_num_filtered = classes_num_filtered[filter_iou]

  result = []
  for i in range(len(boxes_filtered)):
    result.append(
           [self.classes[classes_num_filtered[i]],
           boxes_filtered[i][0],
           boxes_filtered[i][1],
           boxes_filtered[i][2],
           boxes_filtered[i][3],
           probs_filtered[i]])
  return result
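A sketch of how the entries of result could be drawn with OpenCV; each entry is [class_name, x_center, y_center, w, h, prob] (in the repo the coordinates are first rescaled from 448 back to the original image size, which is omitted here):

import cv2

def draw_result(img, result):
    for cls_name, x, y, w, h, prob in result:
        x1, y1 = int(x - w / 2), int(y - h / 2)
        x2, y2 = int(x + w / 2), int(y + h / 2)
        cv2.rectangle(img, (x1, y1), (x2, y2), (0, 255, 0), 2)
        cv2.putText(img, '%s: %.2f' % (cls_name, prob), (x1, y1 - 5),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 1)
    return img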

yolov2

Reference implementation
https://github.com/Yinpenguin/Yolo-pytorch

检测网络结构

The detection network is first pre-trained on ImageNet using the Darknet-19 backbone from Table 6 of the paper. Darknet-19 needs 5.58 billion operations to process one image, compared with 8.52 billion for the GoogLeNet-based backbone of the original YOLO and 30.69 billion for VGG-16.

  • Network structure of the yolov2 detection network


    [figure: layer-by-layer structure of the yolov2 detection network]

    Note in particular the reorg layer (27) and the route layers (25, 28). Following the ResNet idea, route (25) takes a skip connection from layer 16; a pointwise conv then maps the channels from 256 down to 64; the reorg layer reorganizes the 26x26 spatial information into 13x13 by folding it into the depth dimension; finally, following the GoogLeNet idea, the reorg output is concatenated channel-wise with the convolutional output.

  • PyTorch implementation of the reorg layer
    def forward(self, input):
        output = self.stage1_conv1(input)
        output = self.stage1_conv2(output)
        output = self.stage1_conv3(output)
        output = self.stage1_conv4(output)
        output = self.stage1_conv5(output)
        output = self.stage1_conv6(output)
        output = self.stage1_conv7(output)
        output = self.stage1_conv8(output)
        output = self.stage1_conv9(output)
        output = self.stage1_conv10(output)
        output = self.stage1_conv11(output)
        output = self.stage1_conv12(output)
        output = self.stage1_conv13(output)

        residual = output 

        output_1 = self.stage2_a_maxpl(output)
        output_1 = self.stage2_a_conv1(output_1)
        output_1 = self.stage2_a_conv2(output_1)
        output_1 = self.stage2_a_conv3(output_1)
        output_1 = self.stage2_a_conv4(output_1)
        output_1 = self.stage2_a_conv5(output_1)
        output_1 = self.stage2_a_conv6(output_1)
        output_1 = self.stage2_a_conv7(output_1)

    # layer 25 route
        output_2 = self.stage2_b_conv(residual) 
    # layer 27 reorg 
        batch_size, num_channel, height, width = output_2.data.size()
        output_2 = output_2.view(batch_size, int(num_channel / 4), height, 2, width, 2).contiguous()
        output_2 = output_2.permute(0, 3, 5, 1, 2, 4).contiguous()
        output_2 = output_2.view(batch_size, -1, int(height / 2), int(width / 2))
    # layer 28 route
        output = torch.cat((output_1, output_2), 1) 
        output = self.stage3_conv1(output)
        output = self.stage3_conv2(output)

        return output
  • Dissecting the reorg layer
    A blog post claims that the reorg layer can be implemented with tf.space_to_depth.

However, looking at the author's C source code:

  void reorg_cpu(float *x, int w, int h, int c, int n, int stride, int forward, float *out)
  {
      int out_c = c/(stride*stride);

      for(int ni = 0; ni < n; ++ni){
          for(int ci = 0; ci < c; ++ci){
              for(int hi = 0; hi < h; ++hi){
                  for(int wi = 0; wi < w; ++wi){
                      int in_index  = wi + w * (hi + h * (ci + c * ni));
                      int c2 = ci % out_c;
                      int offset = ci / out_c;
                      int h2 = hi * stride + offset / stride; // stride is 2 here, but the effect is not a plain stride-2 sampling
                      int w2 = wi * stride + offset % stride;
                      int out_index = w2 + w * stride * (h2 + h * stride * (c2 + out_c * ni));
                      if(forward) out[out_index] = x[in_index];
                      else out[in_index] = x[out_index]; // this is the branch that is actually used
                  }
              }
          }
      }
  }

In fact, the horizontal direction is sampled with a stride of 2, while the vertical direction is sampled in groups of two rows, also advancing two rows at a time, i.e. an effective vertical stride of 4.
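A direct numpy port of the C loop above makes the sampling pattern easy to inspect on a tiny tensor (the sizes in the demo are arbitrary; the else branch is the one the note above marks as actually used):

import numpy as np

def reorg_cpu(x, w, h, c, n, stride, forward):
    # straight port of the C loop; x and the returned array are flat (n * c * h * w,) buffers
    out = np.zeros_like(x)
    out_c = c // (stride * stride)
    for ni in range(n):
        for ci in range(c):
            for hi in range(h):
                for wi in range(w):
                    in_index = wi + w * (hi + h * (ci + c * ni))
                    c2 = ci % out_c
                    offset = ci // out_c
                    h2 = hi * stride + offset // stride
                    w2 = wi * stride + offset % stride
                    out_index = w2 + w * stride * (h2 + h * stride * (c2 + out_c * ni))
                    if forward:
                        out[out_index] = x[in_index]
                    else:
                        out[in_index] = x[out_index]
    return out

# tiny demo: one sample, 4 channels, 4x4 spatial, stride 2
x = np.arange(1 * 4 * 4 * 4, dtype=np.float32)
y = reorg_cpu(x, w=4, h=4, c=4, n=1, stride=2, forward=0)
print(y.reshape(4, 4, 4))   # inspect which input positions end up in each output channel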

Dataset

training_set = VOCDataset(opt.data_path, opt.year, opt.train_set, opt.image_size) 
training_generator = DataLoader(training_set, **training_params)

When loading a label, the [xmin, ymin, xmax, ymax] values read from the XML file are converted to [xmin, ymin, width, height].

# VOCDataset inherits from torch.utils.data.Dataset and implements __getitem__
def __getitem__(self, item):
    annot = ET.parse(image_xml_path)
    objects = []
    for obj in annot.findall('object'):
        xmin, xmax, ymin, ymax = [int(obj.find('bndbox').find(tag).text) - 1
                                  for tag in ["xmin", "xmax", "ymin", "ymax"]]
        label = self.classes.index(obj.find('name').text.lower().strip())
        objects.append([xmin, ymin, xmax, ymax, label])
        # [xmin, ymin, xmax, ymax] is converted to [xmin, ymin, width, height] inside Resize
    if self.is_training:
        transformations = Compose([HSVAdjust(), VerticalFlip(), Crop(), Resize(self.image_size)])
    else:
        transformations = Compose([Resize(self.image_size)])
    image, objects = transformations((image, objects))
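The conversion mentioned in the comment happens inside the Resize transform; a minimal illustrative sketch of such a transform (not the repo's exact implementation) could look like this:

import cv2

class ResizeSketch(object):
    # resize the image to a square and convert boxes from [xmin, ymin, xmax, ymax, label]
    # to [xmin, ymin, width, height, label] in the resized coordinate frame
    def __init__(self, image_size):
        self.image_size = image_size

    def __call__(self, data):
        image, objects = data
        h, w = image.shape[:2]
        image = cv2.resize(image, (self.image_size, self.image_size))
        resized = []
        for xmin, ymin, xmax, ymax, label in objects:
            xmin = xmin * self.image_size / w
            ymin = ymin * self.image_size / h
            xmax = xmax * self.image_size / w
            ymax = ymax * self.image_size / h
            resized.append([xmin, ymin, xmax - xmin, ymax - ymin, label])
        return image, resized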

anchor

K-means is run on the widths and heights of all boxes in the dataset. Euclidean distance is not used, because it makes large boxes contribute larger distances than small ones; instead the distance is d(box, centroid) = 1 - IOU(box, centroid). A minimal sketch of this clustering is given after the anchor list below.
The 5 anchors obtained by k-means are sized relative to the 13x13 feature map:

anchors=[(1.3221, 1.73145), (3.19275, 4.00944), (5.05587, 8.09892), 
(9.47112, 4.84053),(11.2364, 10.0071)]
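A minimal numpy sketch of this IOU-based k-means on (width, height) pairs; the box_wh array and everything else here is illustrative, not the original clustering script:

import numpy as np

def wh_iou(wh, centroids):
    # IOU between (N, 2) box sizes and (k, 2) centroid sizes, with all boxes aligned at the origin
    inter = np.minimum(wh[:, None, 0], centroids[None, :, 0]) * \
            np.minimum(wh[:, None, 1], centroids[None, :, 1])
    union = wh[:, 0:1] * wh[:, 1:2] + centroids[None, :, 0] * centroids[None, :, 1] - inter
    return inter / union                                          # (N, k)

def kmeans_anchors(box_wh, k=5, iters=100, seed=0):
    wh = np.asarray(box_wh, dtype=np.float64)
    rng = np.random.RandomState(seed)
    centroids = wh[rng.choice(len(wh), k, replace=False)].copy()
    for _ in range(iters):
        assign = np.argmin(1.0 - wh_iou(wh, centroids), axis=1)   # d(box, centroid) = 1 - IOU
        for j in range(k):
            if np.any(assign == j):
                centroids[j] = wh[assign == j].mean(axis=0)       # update centroid with the mean (w, h)
    return centroids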

iou

def bbox_ious(boxes1, boxes2): # boxes1 is gt
    b1x1, b1y1 = (boxes1[:, :2] - (boxes1[:, 2:4] / 2)).split(1, 1) 
    b1x2, b1y2 = (boxes1[:, :2] + (boxes1[:, 2:4] / 2)).split(1, 1)
    b2x1, b2y1 = (boxes2[:, :2] - (boxes2[:, 2:4] / 2)).split(1, 1) 
    b2x2, b2y2 = (boxes2[:, :2] + (boxes2[:, 2:4] / 2)).split(1, 1)


    # width/height of the intersection for each pair: (num_obj, height*width*num_anchors) or (num_obj, num_anchors)
    dx = (b1x2.min(b2x2.t()) - b1x1.max(b2x1.t())).clamp(min=0)  # clamp to keep the value non-negative
    dy = (b1y2.min(b2y2.t()) - b1y1.max(b2y1.t())).clamp(min=0)
    intersections = dx * dy

    areas1 = (b1x2 - b1x1) * (b1y2 - b1y1)
    areas2 = (b2x2 - b2x1) * (b2y2 - b2y1)
    unions = (areas1 + areas2.t()) - intersections

    return intersections / unions

This computes the IOU between every predicted box (all height*width*num_anchors of them) and the ground-truth boxes.

# Set confidence mask of matching detections to 0
# size of cur_pred_boxes's anchor after regression
iou_gt_pred = bbox_ious(gt, cur_pred_boxes) # (num_obj,4) (height*width*num_anchors,4) -> (num_obj,height*width*num_anchors) : iou for each (gt, pred) pair
mask = (iou_gt_pred > self.thresh).sum(0) >= 1 # get obj pos on pred (height*width*num_anchors) 
conf_mask[b][mask.view_as(conf_mask[b])] = 0 # [num_anchors,width*height]  

Then the IOU between every anchor and the ground-truth box sizes is computed to find the best-matching anchor:

iou_gt_anchors = bbox_ious(gt_wh, anchors) # (num_obj,4) (num_anchors,4) -> (num_obj,num_anchors) : iou for each (gt, anchors) pair
 _, best_anchors = iou_gt_anchors.max(1) # best_anchors (num_obj, 1) index of best iou, _ is value of best iou
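A concrete example of this shape matching against the 5 anchors listed above (the gt width/height below is made up; both the gt box and the anchors are treated as centered at the origin, so only width and height matter):

import numpy as np

anchors = np.array([(1.3221, 1.73145), (3.19275, 4.00944), (5.05587, 8.09892),
                    (9.47112, 4.84053), (11.2364, 10.0071)])

gt_w, gt_h = 4.0, 7.5        # hypothetical gt box size on the 13x13 grid

inter = np.minimum(gt_w, anchors[:, 0]) * np.minimum(gt_h, anchors[:, 1])
union = gt_w * gt_h + anchors[:, 0] * anchors[:, 1] - inter
iou = inter / union
best_anchor = int(np.argmax(iou))
print(iou, best_anchor)      # anchor 2 (5.06 x 8.10) matches this shape best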
