Learning Detection
yolo
darkflow quick start
https://github.com/thtrieu/darkflow.git
- Load a model
Initialize with the downloaded weights; note that the cfg and the weights must match, otherwise an error is raised.
flow --model cfg/tiny-yolo-voc.cfg --load bin/tiny-yolo-voc.weights
- Test the model
flow --imgdir ~/dataset/yolo_test/ --model cfg/tiny-yolo-voc.cfg --load bin/tiny-yolo-voc.weights --gpu 1.0
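Besides the flow command line, darkflow can also be used from Python, following the pattern in the darkflow README (the sample image path below is only a placeholder):
from darkflow.net.build import TFNet
import cv2

options = {"model": "cfg/tiny-yolo-voc.cfg", "load": "bin/tiny-yolo-voc.weights",
           "threshold": 0.1, "gpu": 1.0}
tfnet = TFNet(options)

imgcv = cv2.imread("./sample_img/sample_dog.jpg")  # placeholder path
result = tfnet.return_predict(imgcv)  # list of dicts: label, confidence, topleft, bottomright
print(result)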
Reading the yolo source code
https://github.com/hizhangp/yolo_tensorflow
Loading the Pascal VOC annotations
The load_pascal_annotation function in utils/pascal_voc.py
Loads the annotation information of the VOC dataset. VOC annotations are stored in XML files, whose format is a tree structure.
root
<annotation>
<folder>VOC2012</folder>
<filename>2007_000027.jpg</filename>
<source></source>
<size></size>
<segmented>0</segmented>
<object></object>
</annotation>
object
<object>
<name>person</name>
<pose>Frontal</pose>
<truncated>1</truncated>
<difficult>0</difficult>
<bndbox>
<xmin>72</xmin>
<ymin>209</ymin>
<xmax>111</xmax>
<ymax>259</ymax>
</bndbox>
</object>
The function that parses an annotation: def load_pascal_annotation(self, index)
imname = os.path.join(self.data_path, 'JPEGImages', index + '.jpg')
im = cv2.imread(imname)
h_ratio = 1.0 * self.image_size / im.shape[0]  # self.image_size is 448
w_ratio = 1.0 * self.image_size / im.shape[1]
label = np.zeros((self.cell_size, self.cell_size, 25))  # 25 = 1 + 4 + 20
# 1 for confidence (is there an object), 4 for the box, 20 for the per-class prediction
# only 20 classes can be predicted; this is the B=1 case
import xml.etree.ElementTree as ET  # parser for the xml files
filename = os.path.join(self.data_path, 'Annotations', index + '.xml')  # read the xml file
tree = ET.parse(filename)
objs = tree.findall('object')  # 'object' is one branch of the annotation file
for obj in objs:
    bbox = obj.find('bndbox')
    # Make pixel indexes 0-based and clamp them to [0, self.image_size - 1] (448x448 input)
    x1 = max(min((float(bbox.find('xmin').text) - 1) * w_ratio, self.image_size - 1), 0)
    y1 = max(min((float(bbox.find('ymin').text) - 1) * h_ratio, self.image_size - 1), 0)
    x2 = max(min((float(bbox.find('xmax').text) - 1) * w_ratio, self.image_size - 1), 0)
    y2 = max(min((float(bbox.find('ymax').text) - 1) * h_ratio, self.image_size - 1), 0)
    cls_ind = self.class_to_ind[
        obj.find('name').text.lower().strip()  # the 'name' branch records the object's class
    ]  # index of the class this object belongs to
    boxes = [(x2 + x1) / 2.0, (y2 + y1) / 2.0, x2 - x1, y2 - y1]
    x_ind = int(boxes[0] * self.cell_size / self.image_size)
    y_ind = int(boxes[1] * self.cell_size / self.image_size)
    if label[y_ind, x_ind, 0] == 1:
        continue  # since B=1, if this cell already has an object it does not predict another one
        # (with anchors one would instead pick the preset anchor whose aspect ratio is closest to the object's)
    label[y_ind, x_ind, 0] = 1  # set confidence to 1
    label[y_ind, x_ind, 1:5] = boxes
    label[y_ind, x_ind, 5 + cls_ind] = 1
return label, len(objs)  # return the label and the total number of objects ? the ones skipped by continue are still counted??
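As a quick sanity check of the cell assignment above (a standalone toy example, not code from the repo): with image_size = 448 and cell_size = 7, a box centered at (224, 100) in resized-image coordinates lands in grid cell (x_ind, y_ind) = (3, 1).
import numpy as np

image_size, cell_size = 448, 7
box = [224.0, 100.0, 80.0, 60.0]  # made-up box: center (224, 100), width 80, height 60
x_ind = int(box[0] * cell_size / image_size)  # 224 * 7 / 448 = 3.5 -> 3
y_ind = int(box[1] * cell_size / image_size)  # 100 * 7 / 448 = 1.56 -> 1

label = np.zeros((cell_size, cell_size, 25))
label[y_ind, x_ind, 0] = 1        # confidence
label[y_ind, x_ind, 1:5] = box    # box in image coordinates
label[y_ind, x_ind, 5 + 14] = 1   # class one-hot, e.g. index 14 ('person' in the usual alphabetical VOC order)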
Computing the loss
The loss_layer function in yolo/yolo_net.py
When building the label only one bbox per cell is used, but the network loaded from the weights predicts two boxes per cell.
def loss_layer(self, predicts, label, scope='loss_layer'):
    with tf.variable_scope(scope):
        # def __init__(self, is_training=True):
        #     self.offset = np.transpose(np.reshape(
        #         np.array([np.arange(self.cell_size)] * self.cell_size * self.boxes_per_cell),
        #         (self.boxes_per_cell, self.cell_size, self.cell_size)), (1, 2, 0))
        ...
        offset = tf.reshape(tf.constant(self.offset, dtype=tf.float32),
                            [1, self.cell_size, self.cell_size, self.boxes_per_cell])
        offset = tf.tile(offset, [self.batch_size, 1, 1, 1])  # tile across the batch; offsets for the x coordinate
        offset_tran = tf.transpose(offset, (0, 2, 1, 3))      # offsets for the y coordinate
        # Convert to coordinates relative to the image origin; the network output is relative
        # to the cell, and the IoU must be computed in image-level coordinates.
        # Concretely: a center value of 0.9 relative to the image origin always lies in the last
        # cell, whereas relative to the cell the absolute center position also depends on which
        # cell the prediction comes from.
        predict_boxes_tran = tf.stack(
            [(predict_boxes[..., 0] + offset) / self.cell_size,
             (predict_boxes[..., 1] + offset_tran) / self.cell_size,
             tf.square(predict_boxes[..., 2]),  # square when computing the IoU!
             tf.square(predict_boxes[..., 3])], axis=-1)
        # The IoU is computed with image-origin coordinates.
        iou_predict_truth = self.calc_iou(predict_boxes_tran, boxes)
        object_mask = tf.reduce_max(iou_predict_truth, 3, keep_dims=True)  # pick the larger of the two boxes' IoUs
        object_mask = tf.cast((iou_predict_truth >= object_mask), tf.float32) * response
        # response says whether this cell contains an object; object_mask says whether this box
        # of this cell is responsible for the prediction.
        # The comparison iou_predict_truth >= object_mask is there to find the matching box:
        # the responsible box is set to 1, the other box to 0.
        # The resulting tensor has shape (bs, 7, 7, 2), with a 1 at the box of the predicting cell.
        noobject_mask = tf.ones_like(object_mask, dtype=tf.float32) - object_mask  # invert object_mask
        # Convert to cell-relative coordinates; the label is read in image-origin coordinates,
        # but the loss is computed in cell-relative ones.
        boxes_tran = tf.stack(
            [boxes[..., 0] * self.cell_size - offset,
             boxes[..., 1] * self.cell_size - offset_tran,
             tf.sqrt(boxes[..., 2]),  # take the square root when computing the loss!
             tf.sqrt(boxes[..., 3])], axis=-1)
        # class_loss
        class_delta = response * (predict_classes - classes)
        class_loss = tf.reduce_mean(tf.reduce_sum(tf.square(class_delta), axis=[1, 2, 3]),
                                    name='class_loss') * self.class_scale  # 2
        # object_loss -- confidence loss for the responsible boxes; the target is the IoU
        object_delta = object_mask * (predict_scales - iou_predict_truth)
        object_loss = tf.reduce_mean(tf.reduce_sum(tf.square(object_delta), axis=[1, 2, 3]),
                                     name='object_loss') * self.object_scale  # 1
        # noobject_loss
        noobject_delta = noobject_mask * predict_scales
        noobject_loss = tf.reduce_mean(tf.reduce_sum(tf.square(noobject_delta), axis=[1, 2, 3]),
                                       name='noobject_loss') * self.noobject_scale  # 1
        # coord_loss
        coord_mask = tf.expand_dims(object_mask, 4)  # (bs,7,7,2) -> (bs,7,7,2,1)
        boxes_delta = coord_mask * (predict_boxes - boxes_tran)  # (bs,7,7,2,4)
        coord_loss = tf.reduce_mean(tf.reduce_sum(tf.square(boxes_delta), axis=[1, 2, 3, 4]),
                                    name='coord_loss') * self.coord_scale  # 5
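To make the offset bookkeeping concrete, here is a standalone NumPy sketch (small made-up sizes, not tied to the repo) showing what self.offset and its transpose look like for cell_size = 3 and boxes_per_cell = 2, and how a cell-relative x becomes image-relative:
import numpy as np

cell_size, boxes_per_cell = 3, 2
offset = np.transpose(np.reshape(
    np.array([np.arange(cell_size)] * cell_size * boxes_per_cell),
    (boxes_per_cell, cell_size, cell_size)), (1, 2, 0))
print(offset[..., 0])
# [[0 1 2]
#  [0 1 2]
#  [0 1 2]]   -> column index of each cell, i.e. the x offset
print(np.transpose(offset, (1, 0, 2))[..., 0])
# [[0 0 0]
#  [1 1 1]
#  [2 2 2]]   -> row index of each cell, i.e. the y offset
# A cell-relative x of 0.5 predicted by a cell in column 2 becomes (0.5 + 2) / 3 = 0.83
# relative to the image origin.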
Computing the IoU
The calc_iou function in yolo/yolo_net.py
"""calculate ious
Args:
boxes1: 5-D tensor [BATCH_SIZE, CELL_SIZE, CELL_SIZE, BOXES_PER_CELL, 4] ====> (x_center, y_center, w, h)
boxes2: 5-D tensor [BATCH_SIZE, CELL_SIZE, CELL_SIZE, BOXES_PER_CELL, 4] ===> (x_center, y_center, w, h)
Return:
iou: 4-D tensor [BATCH_SIZE, CELL_SIZE, CELL_SIZE, BOXES_PER_CELL]
"""
def calc_iou(self, predicts_boxes, boxes, scope='iou'):
    with tf.variable_scope(scope):
        # convert (x_center, y_center, w, h) to (x1, y1, x2, y2)
        predicts_boxes_t = tf.stack(
            [predicts_boxes[..., 0] - predicts_boxes[..., 2] / 2.0,  # x1 = x_center - w/2
             predicts_boxes[..., 1] - predicts_boxes[..., 3] / 2.0,  # y1 = y_center - h/2
             predicts_boxes[..., 0] + predicts_boxes[..., 2] / 2.0,  # x2 = x_center + w/2
             predicts_boxes[..., 1] + predicts_boxes[..., 3] / 2.0], # y2 = y_center + h/2
            axis=-1)
        boxes_t = tf.stack(
            [boxes[..., 0] - boxes[..., 2] / 2.0,  # x1 = x_center - w/2
             boxes[..., 1] - boxes[..., 3] / 2.0,  # y1 = y_center - h/2
             boxes[..., 0] + boxes[..., 2] / 2.0,  # x2 = x_center + w/2
             boxes[..., 1] + boxes[..., 3] / 2.0], # y2 = y_center + h/2
            axis=-1)
        # calculate the upper-left and lower-right points; boxes is the label,
        # and both boxes of a cell hold the same values (tf.tile)
        lu = tf.maximum(predicts_boxes_t[..., :2], boxes_t[..., :2])  # max of (x1, y1) pairs, (bs,7,7,2,2)
        rb = tf.minimum(predicts_boxes_t[..., 2:], boxes_t[..., 2:])  # min of (x2, y2) pairs, (bs,7,7,2,2)
        # intersection
        intersection = tf.maximum(0.0, rb - lu)                      # width and height of the intersection, (bs,7,7,2,2)
        inter_square = intersection[..., 0] * intersection[..., 1]   # width * height, (bs,7,7,2)
        # calculate union_square
        union_square = predicts_boxes[..., 2] * predicts_boxes[..., 3] + boxes[..., 2] * boxes[..., 3] - inter_square
        union_square = tf.maximum(union_square, 1e-10)
        return tf.clip_by_value(inter_square / union_square, 0.0, 1.0)
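As a standalone sanity check of the same IoU formula on concrete numbers (a toy example, independent of the repo):
def iou_xywh(a, b):
    # a, b: (x_center, y_center, w, h)
    ax1, ay1, ax2, ay2 = a[0] - a[2] / 2, a[1] - a[3] / 2, a[0] + a[2] / 2, a[1] + a[3] / 2
    bx1, by1, bx2, by2 = b[0] - b[2] / 2, b[1] - b[3] / 2, b[0] + b[2] / 2, b[1] + b[3] / 2
    iw = max(0.0, min(ax2, bx2) - max(ax1, bx1))
    ih = max(0.0, min(ay2, by2) - max(ay1, by1))
    inter = iw * ih
    union = a[2] * a[3] + b[2] * b[3] - inter
    return inter / max(union, 1e-10)

print(iou_xywh((0.5, 0.5, 0.4, 0.4), (0.6, 0.5, 0.4, 0.4)))  # 0.12 / 0.20 = 0.6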
Parsing the network output
The interpret_output function in test.py
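Before reading the code it helps to know how the flat output vector is split by boundary1 and boundary2 (my reading of the repo's setup; for S = 7, B = 2, C = 20):
cell_size, boxes_per_cell, num_class = 7, 2, 20
boundary1 = cell_size * cell_size * num_class                            # 980: per-cell class probabilities
boundary2 = boundary1 + cell_size * cell_size * boxes_per_cell           # 1078: per-box confidences
output_size = cell_size * cell_size * (num_class + boxes_per_cell * 5)   # 1470: the rest is 4 coords per box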
def interpret_output(self, output):
    # container for the prob of each (cell, box, class)
    probs = np.zeros((self.cell_size, self.cell_size, self.boxes_per_cell, self.num_class))
    class_probs = np.reshape(output[:self.boundary1],
                             (self.cell_size, self.cell_size, self.num_class))
    scales = np.reshape(output[self.boundary1:self.boundary2],
                        (self.cell_size, self.cell_size, self.boxes_per_cell))
    boxes = np.reshape(output[self.boundary2:],
                       (self.cell_size, self.cell_size, self.boxes_per_cell, 4))
    offset = np.array([np.arange(self.cell_size)] * self.cell_size * self.boxes_per_cell)
    offset = np.transpose(np.reshape(offset, [self.boxes_per_cell, self.cell_size, self.cell_size]), (1, 2, 0))
    # convert to image-origin-based coordinates
    boxes[:, :, :, 0] += offset
    boxes[:, :, :, 1] += np.transpose(offset, (1, 0, 2))
    boxes[:, :, :, :2] = 1.0 * boxes[:, :, :, 0:2] / self.cell_size
    boxes[:, :, :, 2:] = np.square(boxes[:, :, :, 2:])
    boxes *= self.image_size  # 448
    for i in range(self.boxes_per_cell):
        for j in range(self.num_class):
            probs[:, :, i, j] = class_probs[:, :, j] * scales[:, :, i]
    filter_mat_probs = np.array(probs >= self.threshold, dtype='bool')  # probs large enough, per (cell, box, class): (7,7,2,20)
    probs_filtered = probs[filter_mat_probs]  # (7,7,2,20) -> (cell_y, cell_x, box_ind, class_ind) -> (n,)
    filter_mat_boxes = np.nonzero(filter_mat_probs)  # indices as a tuple of 4 arrays: (cell_y, cell_x, box_ind, class_ind)
    boxes_filtered = boxes[filter_mat_boxes[0], filter_mat_boxes[1], filter_mat_boxes[2]]  # (7,7,2,4) -> (n,4)
    classes_num_filtered = np.argmax(probs, axis=3)[  # (7,7,2)
        # changed filter_mat_probs to probs: if several class probabilities pass the threshold,
        # only the class with the largest probability is kept.
        # This can still go wrong if two classes share the same largest probability.
        filter_mat_boxes[0], filter_mat_boxes[1], filter_mat_boxes[2]]  # (7,7,2) -> (n,) class indices
    # Non-maximum suppression
    for i in range(len(boxes_filtered)):
        if probs_filtered[i] == 0:
            continue
        for j in range(i + 1, len(boxes_filtered)):
            if self.iou(boxes_filtered[i], boxes_filtered[j]) > self.iou_threshold:  # 0.5
                probs_filtered[j] = 0.0  # NMS: zero the lower-confidence prob because of the overlap
    filter_iou = np.array(probs_filtered > 0.0, dtype='bool')  # keep mask
    boxes_filtered = boxes_filtered[filter_iou]
    probs_filtered = probs_filtered[filter_iou]
    classes_num_filtered = classes_num_filtered[filter_iou]
    result = []
    for i in range(len(boxes_filtered)):
        result.append(
            [self.classes[classes_num_filtered[i]],
             boxes_filtered[i][0],
             boxes_filtered[i][1],
             boxes_filtered[i][2],
             boxes_filtered[i][3],
             probs_filtered[i]])
    return result
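A hypothetical way to use the returned list (assuming img is the 448x448 network input and results is the return value of interpret_output; each entry is [class_name, x_center, y_center, w, h, prob] in 448-pixel coordinates):
import cv2

for cls_name, x, y, w, h, prob in results:
    x1, y1 = int(x - w / 2), int(y - h / 2)
    x2, y2 = int(x + w / 2), int(y + h / 2)
    cv2.rectangle(img, (x1, y1), (x2, y2), (0, 255, 0), 2)
    cv2.putText(img, '%s %.2f' % (cls_name, prob), (x1, max(y1 - 5, 0)),
                cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 1)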
yolov2
Reference source code
https://github.com/Yinpenguin/Yolo-pytorch
Detection network structure
First, Darknet-19 (Table 6 of the paper) is pretrained on ImageNet. Darknet-19 needs 5.58 billion operations to process one image, compared with the 8.52 billion used by YOLOv1's GoogLeNet-based network.
- yolov2 detection network structure (figure not reproduced in these notes)
The reorg (27) and route (25, 28) layers are worth noting. First, following the ResNet idea, route (25) takes a skip connection from layer 16; a pointwise conv then maps the channels from 256 down to 64; the reorg layer reorganizes the 26x26 spatial information into 13x13 along the depth dimension; finally, following the GoogLeNet idea, the reorg output is concatenated channel-wise with the convolutional output.
- PyTorch implementation of the reorg layer
def forward(self, input):
    output = self.stage1_conv1(input)
    output = self.stage1_conv2(output)
    output = self.stage1_conv3(output)
    output = self.stage1_conv4(output)
    output = self.stage1_conv5(output)
    output = self.stage1_conv6(output)
    output = self.stage1_conv7(output)
    output = self.stage1_conv8(output)
    output = self.stage1_conv9(output)
    output = self.stage1_conv10(output)
    output = self.stage1_conv11(output)
    output = self.stage1_conv12(output)
    output = self.stage1_conv13(output)
    residual = output
    output_1 = self.stage2_a_maxpl(output)
    output_1 = self.stage2_a_conv1(output_1)
    output_1 = self.stage2_a_conv2(output_1)
    output_1 = self.stage2_a_conv3(output_1)
    output_1 = self.stage2_a_conv4(output_1)
    output_1 = self.stage2_a_conv5(output_1)
    output_1 = self.stage2_a_conv6(output_1)
    output_1 = self.stage2_a_conv7(output_1)
    # layer 25 route
    output_2 = self.stage2_b_conv(residual)
    # layer 27 reorg
    batch_size, num_channel, height, width = output_2.data.size()
    output_2 = output_2.view(batch_size, int(num_channel / 4), height, 2, width, 2).contiguous()
    output_2 = output_2.permute(0, 3, 5, 1, 2, 4).contiguous()
    output_2 = output_2.view(batch_size, -1, int(height / 2), int(width / 2))
    # layer 28 route
    output = torch.cat((output_1, output_2), 1)
    output = self.stage3_conv1(output)
    output = self.stage3_conv2(output)
    return output
- Dissecting the reorg layer
A blog post claims the reorg layer can be implemented with tf.space_to_depth (its illustration is omitted here). But looking at the author's C source:
void reorg_cpu(float *x, int w, int h, int c, int n, int stride, int forward, float *out)
{
    int out_c = c/(stride*stride);
    for(int ni = 0; ni < n; ++ni){
        for(int ci = 0; ci < c; ++ci){
            for(int hi = 0; hi < h; ++hi){
                for(int wi = 0; wi < w; ++wi){
                    int in_index = wi + w * (hi + h * (ci + c * ni));
                    int c2 = ci % out_c;
                    int offset = ci / out_c;
                    int h2 = hi * stride + offset / stride; // stride is 2 here, but the effect is not a plain stride-2 sampling
                    int w2 = wi * stride + offset % stride;
                    int out_index = w2 + w * stride * (h2 + h * stride * (c2 + out_c * ni));
                    if(forward) out[out_index] = x[in_index];
                    else out[in_index] = x[out_index]; // this is the branch actually used
                }
            }
        }
    }
}
In fact, the horizontal direction is sampled with a stride of 2, while the vertical direction samples two rows as a group, with a stride of two rows as well, i.e. effectively a stride of 4; the NumPy port below can be used to inspect this.
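To check that reading, here is a direct NumPy port of the C loops above (a sketch for experimentation; the flat indexing mirrors the C code, and the output shape interpretation is my own):
import numpy as np

def reorg_cpu(x, w, h, c, n, stride, forward):
    # x is a flat float array of length n*c*h*w, laid out as (n, c, h, w)
    out = np.empty_like(x)
    out_c = c // (stride * stride)
    for ni in range(n):
        for ci in range(c):
            for hi in range(h):
                for wi in range(w):
                    in_index = wi + w * (hi + h * (ci + c * ni))
                    c2 = ci % out_c
                    offset = ci // out_c
                    h2 = hi * stride + offset // stride
                    w2 = wi * stride + offset % stride
                    out_index = w2 + w * stride * (h2 + h * stride * (c2 + out_c * ni))
                    if forward:
                        out[out_index] = x[in_index]
                    else:
                        out[in_index] = x[out_index]  # the branch the reorg layer uses
    return out

# Inspect the sampling pattern on a tiny made-up tensor: n=1, c=4, h=w=4, stride=2.
x = np.arange(1 * 4 * 4 * 4, dtype=np.float32)
y = reorg_cpu(x, w=4, h=4, c=4, n=1, stride=2, forward=0)
print(y.reshape(4, 4, 4))  # view as (c, h, w) to see which input positions each output channel gathers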
Dataset
training_set = VOCDataset(opt.data_path, opt.year, opt.train_set, opt.image_size)
training_generator = DataLoader(training_set, **training_params)
When the labels are loaded, the [xmin, ymin, xmax, ymax] read from the xml file is converted to [xmin, ymin, width, height] (this happens inside Resize; a sketch follows the code below).
# VOCDataset subclasses torch.utils.data.Dataset and implements __getitem__
def __getitem__(self, item):
    annot = ET.parse(image_xml_path)
    objects = []
    for obj in annot.findall('object'):
        xmin, xmax, ymin, ymax = [int(obj.find('bndbox').find(tag).text) - 1
                                  for tag in ["xmin", "xmax", "ymin", "ymax"]]
        label = self.classes.index(obj.find('name').text.lower().strip())
        objects.append([xmin, ymin, xmax, ymax, label])
    # [xmin, ymin, xmax, ymax] is converted to [xmin, ymin, width, height] inside Resize
    if self.is_training:
        transformations = Compose([HSVAdjust(), VerticalFlip(), Crop(), Resize(self.image_size)])
    else:
        transformations = Compose([Resize(self.image_size)])
    image, objects = transformations((image, objects))
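A minimal sketch of what that conversion inside Resize amounts to (the function and variable names here are my own, not necessarily the repo's):
import cv2

def resize_sample(image, objects, image_size):
    # rescale the image to image_size x image_size and convert each object from
    # [xmin, ymin, xmax, ymax, label] to [xmin, ymin, width, height, label]
    height, width = image.shape[:2]
    image = cv2.resize(image, (image_size, image_size))
    width_ratio = float(image_size) / width
    height_ratio = float(image_size) / height
    new_objects = []
    for xmin, ymin, xmax, ymax, label in objects:
        new_objects.append([xmin * width_ratio, ymin * height_ratio,
                            (xmax - xmin) * width_ratio, (ymax - ymin) * height_ratio, label])
    return image, new_objects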
anchor
k-means is run over the widths and heights of all boxes in the dataset. Euclidean distance is not used, because under Euclidean distance larger boxes produce larger errors; instead the distance d(box, centroid) = 1 - IoU(box, centroid) from the YOLOv2 paper is used (a toy clustering sketch follows the anchor list below).
The 5 anchors obtained by k-means; their sizes are relative to the 13x13 feature map.
anchors=[(1.3221, 1.73145), (3.19275, 4.00944), (5.05587, 8.09892),
(9.47112, 4.84053),(11.2364, 10.0071)]
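A minimal sketch of that clustering (my own toy implementation, not the repo's code), using 1 - IoU as the distance and assuming all boxes share a common center:
import numpy as np

def iou_wh(wh, centroids):
    # IoU between one (w, h) pair and each centroid, assuming the boxes share a center
    inter = np.minimum(wh[0], centroids[:, 0]) * np.minimum(wh[1], centroids[:, 1])
    union = wh[0] * wh[1] + centroids[:, 0] * centroids[:, 1] - inter
    return inter / union

def kmeans_anchors(whs, k=5, iters=100, seed=0):
    # whs: array of shape (num_boxes, 2) with box widths and heights (e.g. in 13x13 grid units)
    rng = np.random.default_rng(seed)
    centroids = whs[rng.choice(len(whs), k, replace=False)].astype(np.float64)
    for _ in range(iters):
        assign = np.array([np.argmin(1 - iou_wh(wh, centroids)) for wh in whs])
        for ki in range(k):
            if np.any(assign == ki):
                centroids[ki] = whs[assign == ki].mean(axis=0)
    return centroids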
iou
def bbox_ious(boxes1, boxes2):  # boxes1 is gt
    b1x1, b1y1 = (boxes1[:, :2] - (boxes1[:, 2:4] / 2)).split(1, 1)
    b1x2, b1y2 = (boxes1[:, :2] + (boxes1[:, 2:4] / 2)).split(1, 1)
    b2x1, b2y1 = (boxes2[:, :2] - (boxes2[:, 2:4] / 2)).split(1, 1)
    b2x2, b2y2 = (boxes2[:, :2] + (boxes2[:, 2:4] / 2)).split(1, 1)
    # find min and max for each pair: (num_obj, height*width*num_anchors) or (num_obj, num_anchors)
    dx = (b1x2.min(b2x2.t()) - b1x1.max(b2x1.t())).clamp(min=0)  # clamp to ensure the value is non-negative
    dy = (b1y2.min(b2y2.t()) - b1y1.max(b2y1.t())).clamp(min=0)
    intersections = dx * dy
    areas1 = (b1x2 - b1x1) * (b1y2 - b1y1)
    areas2 = (b2x2 - b2x1) * (b2y2 - b2y1)
    unions = (areas1 + areas2.t()) - intersections
    return intersections / unions
Compute the IoU between all boxes (height*width*num_anchors of them) and the ground truth.
# Set confidence mask of matching detections to 0
# cur_pred_boxes: the anchor boxes after applying the predicted regression
iou_gt_pred = bbox_ious(gt, cur_pred_boxes) # (num_obj,4) (height*width*num_anchors,4) -> (num_obj,height*width*num_anchors) : iou for each (gt, pred) pair
mask = (iou_gt_pred > self.thresh).sum(0) >= 1 # get obj pos on pred (height*width*num_anchors)
conf_mask[b][mask.view_as(conf_mask[b])] = 0 # [num_anchors,width*height]
Compute the IoU between all anchors and the ground truth, and pick the best-matching anchor.
iou_gt_anchors = bbox_ious(gt_wh, anchors) # (num_obj,4) (num_anchors,4) -> (num_obj,num_anchors) : iou for each (gt, anchors) pair
_, best_anchors = iou_gt_anchors.max(1) # best_anchors (num_obj, 1) index of best iou, _ is value of best iou