The DeepLabv3+ Algorithm

Author: LuDon | Published 2018-12-03 15:35


Encoder-Decoder with Atrous Convolution

Atrous (dilated) convolution lets a deep convolutional network explicitly control the resolution at which features are computed, and adjusts the filter's field of view so that multi-scale information can be captured. For a two-dimensional signal, at each output location i we have
y[i] = \sum_k x[i + r \cdot k] \, w[k]
where the atrous rate r is the stride with which the input signal is sampled; standard convolution is the special case r = 1.
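
As a minimal illustration (not from the original post; all shapes are assumptions), TensorFlow 1.x exposes atrous convolution directly, with the rate argument playing the role of r:

import tensorflow as tf

x = tf.placeholder(tf.float32, [1, 65, 65, 256])  # hypothetical feature map
w = tf.get_variable('w', [3, 3, 256, 256])        # 3x3 filter
# rate=2 inserts one gap between filter taps, enlarging the effective
# field of view from 3x3 to 5x5 with no extra parameters.
y = tf.nn.atrous_conv2d(x, w, rate=2, padding='SAME')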

A depthwise separable convolution factorizes a standard convolution into a depthwise convolution (a separate spatial convolution applied to each input channel) followed by a pointwise 1 \times 1 convolution (which combines the depthwise outputs across channels).
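
A minimal sketch of this factorization in TensorFlow 1.x (shapes are illustrative assumptions):

import tensorflow as tf

x = tf.placeholder(tf.float32, [1, 65, 65, 256])
dw = tf.get_variable('dw', [3, 3, 256, 1])    # one 3x3 spatial filter per channel
pw = tf.get_variable('pw', [1, 1, 256, 256])  # 1x1 filters mixing the channels
y = tf.nn.separable_conv2d(x, dw, pw, strides=[1, 1, 1, 1], padding='SAME')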

DeepLabv3 as encoder: the feature map just before the logits is taken as the encoder output; it has 256 channels and carries rich semantic information.

Decoder: the encoder output is first bilinearly upsampled by a factor of 4, then concatenated with the corresponding low-level features (after their channel count is reduced by a 1 \times 1 convolution), and finally passed through a few 3 \times 3 convolution layers.

Atrous Spatial Pyramid Pooling

Four parallel convolution branches with different atrous rates are applied to the same input; each rate captures context at a different scale.

ASPP has two parts: multi-scale atrous convolutions and image-level features. The multi-scale part consists of a standard 1 \times 1 convolution and three 3 \times 3 atrous convolutions with rates 6, 12, and 18 (for output stride 16). For the image-level features, the input is averaged over the spatial dimensions (dims 1 and 2), passed through a 1 \times 1 convolution, and bilinearly resized back to the spatial size of the input. Finally, the outputs of the four convolution branches and the image-level feature are concatenated, and one more 1 \times 1 convolution produces the ASPP output.

import tensorflow as tf
from tensorflow.contrib import layers as layers_lib

def ASPP(inputs, depth=256, atrous_rates=(6, 12, 18)):
    inputs_size = tf.shape(inputs)[1:3]
    # (a) One 1x1 convolution and three 3x3 atrous convolutions with
    # rates (6, 12, 18) when output stride = 16; the rates are doubled
    # when output stride = 8.
    conv_1x1 = layers_lib.conv2d(inputs, depth, [1, 1], stride=1, scope='conv_1x1')
    conv_3x3_1 = layers_lib.conv2d(inputs, depth, [3, 3], stride=1, rate=atrous_rates[0], scope='conv_3x3_1')
    conv_3x3_2 = layers_lib.conv2d(inputs, depth, [3, 3], stride=1, rate=atrous_rates[1], scope='conv_3x3_2')
    conv_3x3_3 = layers_lib.conv2d(inputs, depth, [3, 3], stride=1, rate=atrous_rates[2], scope='conv_3x3_3')

    # (b) Image-level features: global average pooling over the spatial dims
    # (keepdims=True leaves a 1x1 spatial map per channel).
    image_level_features = tf.reduce_mean(inputs, [1, 2], name='global_average_pooling', keepdims=True)
    # 1x1 convolution with `depth` filters (and batch normalization).
    image_level_features = layers_lib.conv2d(image_level_features, depth, [1, 1], stride=1, scope='conv_1x1_pool')
    # Bilinearly upsample back to the input's spatial size.
    image_level_features = tf.image.resize_bilinear(image_level_features, inputs_size, name='upsample')

    # Concatenate the five branches along the channel axis and project with a 1x1 convolution.
    net = tf.concat([conv_1x1, conv_3x3_1, conv_3x3_2, conv_3x3_3, image_level_features], axis=3, name='concat')
    net = layers_lib.conv2d(net, depth, [1, 1], stride=1, scope='conv_1x1_concat')
    return net
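
For reference, the tensor fed to ASPP is the backbone's final feature map. A tiny usage sketch (all shapes are illustrative assumptions, not from the post):

features = tf.placeholder(tf.float32, [1, 33, 33, 2048])  # hypothetical output-stride-16 backbone output
encoder_output = ASPP(features)  # -> [1, 33, 33, 256]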

Loss function

The network output is a pixel-wise softmax:
p_k(x) = \frac{\exp(a_k(x))}{\sum_{k'=1}^{K} \exp(a_{k'}(x))}
where x is a pixel position in the image plane and a_k(x) is the value of the k-th channel at pixel x in the network's final output layer; p_k(x) is then the probability that pixel x belongs to class k.

The loss is the pixel-wise cross entropy (negative log-likelihood):
E = -\sum_{x} w(x) \log p_{\ell(x)}(x)
where \ell(x) is the true label of pixel x, p_{\ell(x)}(x) is the predicted probability on the true-label channel, and w(x) is an optional per-pixel weight.
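
As a sketch of how this is computed in practice (shapes are illustrative; step 3 below uses the equivalent library call tf.losses.sparse_softmax_cross_entropy):

logits = tf.placeholder(tf.float32, [None, 513, 513, 21])  # a_k(x), hypothetical shape
labels = tf.placeholder(tf.int32, [None, 513, 513])        # l(x)
# Per-pixel -log p_{l(x)}(x); averaging gives E with uniform weights w(x) = 1.
pixel_nll = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels, logits=logits)
loss = tf.reduce_mean(pixel_nll)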

Steps

1. Write the dataset to a TFRecord file

import io
import PIL.Image
import tensorflow as tf
# dataset_util is the helper module from the TensorFlow models repository.

writer = tf.python_io.TFRecordWriter(output_filename)

# Read the image.
fid = tf.gfile.GFile(image_path, 'rb')
encoded_jpg = fid.read()
encoded_jpg_io = io.BytesIO(encoded_jpg)
image = PIL.Image.open(encoded_jpg_io)
width, height = image.size
# Read the mask.
fid = tf.gfile.GFile(label_path, 'rb')
encoded_label = fid.read()
encoded_label_io = io.BytesIO(encoded_label)
label = PIL.Image.open(encoded_label_io)

example = tf.train.Example(features=tf.train.Features(feature={
    'image/height': dataset_util.int64_feature(height),
    'image/width': dataset_util.int64_feature(width),
    'image/encoded': dataset_util.bytes_feature(encoded_jpg),
    'image/format': dataset_util.bytes_feature('jpeg'.encode('utf8')),
    'label/encoded': dataset_util.bytes_feature(encoded_label),
    'label/format': dataset_util.bytes_feature('png'.encode('utf8')),}))

writer.write(example.SerializeToString())
writer.close()
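
To read the records back during training (a minimal sketch; the feature keys match the writer above, the rest is an assumption about how one might decode them):

def parse_record(raw_record):
    keys_to_features = {
        'image/encoded': tf.FixedLenFeature((), tf.string),
        'label/encoded': tf.FixedLenFeature((), tf.string),
    }
    parsed = tf.parse_single_example(raw_record, keys_to_features)
    # Decode back to dense tensors: HxWx3 image, HxWx1 mask.
    image = tf.image.decode_jpeg(parsed['image/encoded'], channels=3)
    label = tf.image.decode_png(parsed['label/encoded'], channels=1)
    return image, label

dataset = tf.data.TFRecordDataset(output_filename).map(parse_record)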

2. Define the model

def deeplabv3_plus_model_fn(features, labels, mode, params):
    num_classes = params['num_classes']
    # Inputs arrive mean-subtracted; adding the means back gives viewable images.
    images = tf.cast(tf.map_fn(preprocessing.mean_image_addition, features), tf.uint8)
    inputs = tf.transpose(features, [0, 3, 1, 2])
    logits, end_points = base_model(inputs)
    inputs_size = tf.shape(images)[1:3]
    net = end_points['/block4']
    encoder_output = ASPP(net)

    ### Decoder
    # Extract the low-level features.
    low_level_features = end_points['/block1/unit_s/bottleneck_v2/conv1']
    # Low-level features usually have many channels (256 or 512);
    # a 1x1 convolution reduces the channel count (to 48 here).
    low_level_features = layers_lib.conv2d(low_level_features, 48, [1, 1], stride=1)
    low_level_features_size = tf.shape(low_level_features)[1:3]
    # Upsample the encoder output and concatenate it with the low-level features.
    net = tf.image.resize_bilinear(encoder_output, low_level_features_size, name='upsample_1')
    net = tf.concat([net, low_level_features], axis=3, name='concat')
    # A few 3x3 convolutions refine the features before the final upsampling.
    net = layers_lib.conv2d(net, 256, [3, 3], stride=1, scope='conv_3x3_1')
    net = layers_lib.conv2d(net, 256, [3, 3], stride=1, scope='conv_3x3_2')
    net = layers_lib.conv2d(net, num_classes, [1, 1], activation_fn=None, normalizer_fn=None, scope='conv_1x1')
    # Upsample the logits to the input resolution.
    logits = tf.image.resize_bilinear(net, inputs_size, name='upsample_2')
    pred_classes = tf.expand_dims(tf.argmax(logits, axis=3, output_type=tf.int32), axis=3)
    return pred_classes

3. Train

def train(logits, labels, pred_classes, mode, params):
    ### Define the loss.
    labels = tf.squeeze(labels, axis=3)
    logits_by_num_classes = tf.reshape(logits, [-1, params['num_classes']])
    labels_flat = tf.reshape(labels, [-1])
    # Keep only pixels whose label is valid: 1 = valid, 0 = ignore.
    valid_indices = tf.to_int32(labels_flat <= params['num_classes'] - 1)
    valid_logits = tf.dynamic_partition(logits_by_num_classes, valid_indices, num_partitions=2)[1]
    valid_labels = tf.dynamic_partition(labels_flat, valid_indices, num_partitions=2)[1]

    preds_flat = tf.reshape(pred_classes, [-1])
    valid_preds = tf.dynamic_partition(preds_flat, valid_indices, num_partitions=2)[1]
    confusion_matrix = tf.confusion_matrix(valid_labels, valid_preds, num_classes=params['num_classes'])

    cross_entropy = tf.losses.sparse_softmax_cross_entropy(logits=valid_logits, labels=valid_labels)
    train_var_list = [v for v in tf.trainable_variables()]
    # Cross-entropy loss + L2 regularization (_WEIGHT_DECAY is a module-level default).
    loss = cross_entropy + params.get('weight_decay', _WEIGHT_DECAY) * tf.add_n(
        [tf.nn.l2_loss(v) for v in train_var_list])

    ### Global step and polynomial learning-rate decay.
    global_step = tf.train.get_or_create_global_step()
    learning_rate = tf.train.polynomial_decay(
        params['initial_learning_rate'],
        tf.cast(global_step, tf.int32) - params['initial_global_step'],
        params['max_iter'],
        params['end_learning_rate'],
        power=params['power'])

    tf.identity(learning_rate, name='learning_rate')
    optimizer = tf.train.MomentumOptimizer(learning_rate=learning_rate, momentum=params['momentum'])
    # Batch-norm statistics are updated through UPDATE_OPS.
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(update_ops):
        train_op = optimizer.minimize(loss, global_step, var_list=train_var_list)

    predictions = {'classes': pred_classes}
    accuracy = tf.metrics.accuracy(valid_labels, valid_preds)
    mean_iou = tf.metrics.mean_iou(valid_labels, valid_preds, params['num_classes'])
    metrics = {'px_accuracy': accuracy, 'mean_iou': mean_iou}

    return tf.estimator.EstimatorSpec(
        mode=mode,
        predictions=predictions,
        loss=loss,
        train_op=train_op,
        eval_metric_ops=metrics)
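
The post doesn't show how the model function is wired up for training; a minimal sketch, where input_fn and all parameter values are placeholders:

model = tf.estimator.Estimator(
    model_fn=deeplab_model.deeplabv3_plus_model_fn,
    model_dir='model/',  # placeholder checkpoint directory
    params={
        'num_classes': 21,  # hypothetical values throughout
        'weight_decay': 2e-4,
        'initial_learning_rate': 7e-3,
        'initial_global_step': 0,
        'max_iter': 30000,
        'end_learning_rate': 1e-6,
        'power': 0.9,
        'momentum': 0.9,
    })
# input_fn is a hypothetical function returning (features, labels) batches
# from the TFRecord file written in step 1.
model.train(input_fn=lambda: input_fn('train.tfrecord'), steps=30000)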

4. Inference

def predict(img):
    # img: an HxWx3 numpy array, e.g. img = plt.imread(image_files[0])
    image = tf.convert_to_tensor(img)
    image = tf.to_float(tf.image.convert_image_dtype(image, dtype=tf.uint8))
    image.set_shape([None, None, 3])
    images = preprocessing.mean_image_subtraction(image)
    images = tf.reshape(images, [1, tf.shape(image)[0], tf.shape(image)[1], 3])
    labels = tf.zeros([1, tf.shape(image)[0], tf.shape(image)[1], 1])  # dummy labels for EVAL mode
    labels = tf.to_int32(tf.image.convert_image_dtype(labels, dtype=tf.uint8))
    predictions = deeplab_model.deeplabv3_plus_model_fn(
          images,
          labels,
          tf.estimator.ModeKeys.EVAL,
          params={
              'output_stride': FLAGS.output_stride,
              'batch_size': 1,  # batch size must be 1 because image sizes may differ
              'base_architecture': FLAGS.base_architecture,
              'pre_trained_model': None,
              'batch_norm_decay': None,
              'num_classes': _NUM_CLASSES,
              'freeze_batch_norm': True
          }).predictions
    saver = tf.train.Saver()
    with tf.Session() as sess:
        # Restore the latest checkpoint, e.g. model/model.ckpt-73536.
        ckpt = tf.train.get_checkpoint_state(FLAGS.model_dir)
        saver.restore(sess, ckpt.model_checkpoint_path)
        preds = sess.run(predictions)
    pred = preds['classes']
    pred = pred.astype(np.float32)
    # Swap the two classes (project-specific relabeling: 1 -> 0, 0 -> 1).
    pred[pred == 1] = -1
    pred += 1
    return pred[0, :, :, 0].astype(np.uint8)

Evaluation metric

IoU measures the overlap between the region predicted by the model and the annotated region: the intersection of the detection result and the ground truth, divided by their union.
\mathrm{IoU} = \frac{\lvert \text{detection result} \cap \text{ground truth} \rvert}{\lvert \text{detection result} \cup \text{ground truth} \rvert}
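
For multi-class segmentation, per-class IoU can be read off the confusion matrix computed in step 3. A small helper (hypothetical, not from the post):

import numpy as np

def mean_iou_from_confusion(cm):
    # cm[i, j] = number of pixels with true class i predicted as class j.
    tp = np.diag(cm).astype(np.float64)
    fp = cm.sum(axis=0) - tp
    fn = cm.sum(axis=1) - tp
    iou = tp / np.maximum(tp + fp + fn, 1.0)  # guard against empty classes
    return iou.mean()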

Results during training

[Figures: training IoU curve, training pixel accuracy curve, and example segmentation results]

Issues and notes

  • The training images are large; most exceed 512 \times 512. With crop_size set to 256 \times 256, most crops contain only sky or only foreground, so 512 \times 512 works better. Results were also better without padding.


Appendix

ResNet-101 structure

conv1    7x7, 64, stride 2                     out: 112x112
conv2_x  [1x1, 64;  3x3, 64;  1x1, 256]  x3    out: 56x56
conv3_x  [1x1, 128; 3x3, 128; 1x1, 512]  x4    out: 28x28
conv4_x  [1x1, 256; 3x3, 256; 1x1, 1024] x23   out: 14x14
conv5_x  [1x1, 512; 3x3, 512; 1x1, 2048] x3    out: 7x7
         average pool, 1000-d fc, softmax

(Output sizes assume a 224x224 input.)
