DeepLabv3+
Encoder-Decoder with Atrous Convolution
Atrous convolution lets us explicitly control the resolution of the features computed by a deep convolutional network and adjust the kernel's field of view to capture multi-scale information. For a two-dimensional signal, each output position $i$ is computed as

$$y[i] = \sum_{k} x[i + r \cdot k] \, w[k]$$

where the atrous rate $r$ is the stride with which the input signal is sampled.
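As a minimal illustration (my own sketch, not from the original post), TensorFlow 1.x exposes this operation directly as tf.nn.atrous_conv2d; the shapes below are arbitrary examples:

import tensorflow as tf

x = tf.placeholder(tf.float32, [1, 65, 65, 256])   # NHWC feature map
w = tf.get_variable('w_atrous', [3, 3, 256, 256])  # 3x3 kernel
# rate=2 samples the input with stride 2 between kernel taps, enlarging the
# effective field of view from 3x3 to 5x5 with no extra parameters
y = tf.nn.atrous_conv2d(x, w, rate=2, padding='SAME')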
Depthwise separable convolution factorizes a standard convolution into a depthwise convolution (an independent spatial convolution applied to each input channel) followed by a pointwise 1x1 convolution that combines the depthwise outputs.
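A minimal sketch of this factorization in TensorFlow 1.x (my own example; passing a rate yields the atrous separable convolution used by DeepLabv3+):

x = tf.placeholder(tf.float32, [1, 65, 65, 256])
dw = tf.get_variable('w_depthwise', [3, 3, 256, 1])    # one 3x3 filter per channel
pw = tf.get_variable('w_pointwise', [1, 1, 256, 256])  # 1x1 mixing convolution
y = tf.nn.separable_conv2d(x, dw, pw, strides=[1, 1, 1, 1],
                           padding='SAME', rate=[2, 2])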
DeepLabv3 as encoder: the feature map just before the logits is taken as the encoder output; it has 256 channels and carries rich semantic information.
Decoder: the encoder output is first bilinearly upsampled by a factor of 4, then concatenated with the corresponding low-level features, and finally passed through a few 3x3 convolution layers (see the decoder code in step 2).
Atrous Spatial Pyramid Pooling
Four atrous convolutions with different rates are applied in parallel, so that different rates capture information at different scales.
ASPP consists of two parts: multi-scale atrous convolutions and image-level features. The multi-scale branch contains a plain 1x1 convolution, a 3x3 atrous convolution with rate 6, a 3x3 atrous convolution with rate 12, and a 3x3 atrous convolution with rate 18. For the image-level features, the input is averaged over dimensions [1, 2] (global average pooling), passed through a 1x1 convolution, and bilinearly resized back to the input feature size. Finally, the four convolution outputs and the image-level features are concatenated and fed through one more 1x1 convolution to produce the module output.
import tensorflow as tf
from tensorflow.contrib import layers as layers_lib

def ASPP(inputs, depth=256, atrous_rates=(6, 12, 18)):
    inputs_size = tf.shape(inputs)[1:3]
    # (a) one 1x1 convolution and three 3x3 convolutions with rates = (6, 12, 18)
    # when output stride = 16; the rates are doubled when output stride = 8.
    conv_1x1 = layers_lib.conv2d(inputs, depth, [1, 1], stride=1, scope='conv_1x1')
    conv_3x3_1 = layers_lib.conv2d(inputs, depth, [3, 3], stride=1, rate=atrous_rates[0], scope='conv_3x3_1')
    conv_3x3_2 = layers_lib.conv2d(inputs, depth, [3, 3], stride=1, rate=atrous_rates[1], scope='conv_3x3_2')
    conv_3x3_3 = layers_lib.conv2d(inputs, depth, [3, 3], stride=1, rate=atrous_rates[2], scope='conv_3x3_3')
    # (b) the image-level features: global average pooling over the spatial
    # dims (1 and 2), keeping them as size-1 dims
    image_level_features = tf.reduce_mean(inputs, [1, 2], name='global_average_pooling', keepdims=True)
    # 1x1 convolution with 256 filters (and batch normalization)
    image_level_features = layers_lib.conv2d(image_level_features, depth, [1, 1], stride=1, scope='conv_1x1_pool')
    # bilinearly upsample the pooled features back to the input feature size
    image_level_features = tf.image.resize_bilinear(image_level_features, inputs_size, name='upsample')
    # concatenate the five branches along the channel axis and fuse with a 1x1 conv
    net = tf.concat([conv_1x1, conv_3x3_1, conv_3x3_2, conv_3x3_3, image_level_features], axis=3, name='concat')
    net = layers_lib.conv2d(net, depth, [1, 1], stride=1, scope='conv_1x1_concat')
    return net
Loss function
The network output is a pixel-wise softmax:

$$p_k(x) = \frac{\exp(a_k(x))}{\sum_{k'=1}^{K} \exp(a_{k'}(x))}$$

where $x$ is a pixel position in the 2D plane, $a_k(x)$ is the value of channel $k$ at pixel $x$ in the network's final output layer, and $p_k(x)$ is the probability that pixel $x$ belongs to class $k$.
The loss is the cross-entropy (negative log-likelihood):

$$L = -\sum_{x} \log p_{\ell(x)}(x)$$

where $p_{\ell(x)}(x)$ is the output probability of pixel $x$ in the channel of its ground-truth label $\ell(x)$.
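As a quick numeric check of these formulas (my own example, not from the post):

import numpy as np

a = np.array([2.0, 0.5, -1.0])        # a_k(x) for one pixel, K = 3 classes
p = np.exp(a) / np.exp(a).sum()       # pixel-wise softmax p_k(x)
label = 0                             # ground-truth class l(x)
loss = -np.log(p[label])              # this pixel's contribution to L
print(p.round(3), loss.round(3))      # [0.786 0.175 0.039] 0.241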
Steps
1. Convert the dataset to TFRecord files
import io
import PIL.Image
import tensorflow as tf

writer = tf.python_io.TFRecordWriter(output_filename)
# read the image
fid = tf.gfile.GFile(image_path, 'rb')
encoded_jpg = fid.read()
encoded_jpg_io = io.BytesIO(encoded_jpg)
image = PIL.Image.open(encoded_jpg_io)
width, height = image.size
# read the mask
fid = tf.gfile.GFile(label_path, 'rb')
encoded_label = fid.read()
encoded_label_io = io.BytesIO(encoded_label)
label = PIL.Image.open(encoded_label_io)
example = tf.train.Example(features=tf.train.Features(feature={
    'image/height': dataset_util.int64_feature(height),
    'image/width': dataset_util.int64_feature(width),
    'image/encoded': dataset_util.bytes_feature(encoded_jpg),
    'image/format': dataset_util.bytes_feature('jpeg'.encode('utf8')),
    'label/encoded': dataset_util.bytes_feature(encoded_label),
    'label/format': dataset_util.bytes_feature('png'.encode('utf8')),
}))
writer.write(example.SerializeToString())
writer.close()
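For training, these records must be decoded back into tensors. A minimal sketch of the reverse step (the feature keys mirror the Example above; parse_record and the tf.data pipeline are my own illustration):

def parse_record(raw_record):
    keys_to_features = {
        'image/height': tf.FixedLenFeature((), tf.int64),
        'image/width': tf.FixedLenFeature((), tf.int64),
        'image/encoded': tf.FixedLenFeature((), tf.string),
        'image/format': tf.FixedLenFeature((), tf.string, default_value='jpeg'),
        'label/encoded': tf.FixedLenFeature((), tf.string),
        'label/format': tf.FixedLenFeature((), tf.string, default_value='png'),
    }
    parsed = tf.parse_single_example(raw_record, keys_to_features)
    image = tf.image.decode_jpeg(parsed['image/encoded'], channels=3)
    label = tf.image.decode_png(parsed['label/encoded'], channels=1)
    return image, label

dataset = tf.data.TFRecordDataset(output_filename).map(parse_record)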
2. Define the model
def deeplabv3_plus_model_fn(features, labels, mode, params):
    # add the pixel mean back to get displayable uint8 images (for summaries)
    images = tf.cast(tf.map_fn(preprocessing.mean_image_addition, features), tf.uint8)
    # NHWC -> NCHW, for a backbone that expects channels_first
    inputs = tf.transpose(features, [0, 3, 1, 2])
    logits, end_points = base_model(inputs)
    inputs_size = tf.shape(images)[1:3]
    net = end_points['/block4']
    encoder_output = ASPP(net)
    ### decoder
    # extract the low-level features
    low_level_features = end_points['/block1/unit_3/bottleneck_v2/conv1']
    # low-level features usually have many channels (256 or 512);
    # use a 1x1 convolution to reduce the number of output channels
    low_level_features = layers_lib.conv2d(low_level_features, 48, [1, 1], stride=1, scope='conv_1x1_low_level')
    low_level_features_size = tf.shape(low_level_features)[1:3]
    # upsample the encoder output and concatenate with the low-level features
    net = tf.image.resize_bilinear(encoder_output, low_level_features_size, name='upsample_1')
    net = tf.concat([net, low_level_features], axis=3, name='concat')
    # a few 3x3 convolutions to refine the features before the final upsampling
    net = layers_lib.conv2d(net, 256, [3, 3], stride=1, scope='conv_3x3_1')
    net = layers_lib.conv2d(net, 256, [3, 3], stride=1, scope='conv_3x3_2')
    net = layers_lib.conv2d(net, num_classes, [1, 1], activation_fn=None, normalizer_fn=None, scope='conv_1x1')
    # output logits, upsampled back to the input image size
    logits = tf.image.resize_bilinear(net, inputs_size, name='upsample_2')
    pred_classes = tf.expand_dims(tf.argmax(logits, axis=3, output_type=tf.int32), axis=3)
    return pred_classes
3. Training
# training
def train():
    ### define the loss
    labels = tf.squeeze(labels, axis=3)
    logits_by_num_classes = tf.reshape(logits, [-1, params['num_classes']])
    labels_flat = tf.reshape(labels, [-1])
    # keep only pixels whose label is valid: a 0/1 mask that drops pixels
    # with label >= num_classes (e.g. the ignore label)
    valid_indices = tf.to_int32(labels_flat <= params['num_classes'] - 1)
    valid_logits = tf.dynamic_partition(logits_by_num_classes, valid_indices, num_partitions=2)[1]
    valid_labels = tf.dynamic_partition(labels_flat, valid_indices, num_partitions=2)[1]
    preds_flat = tf.reshape(pred_classes, [-1])
    valid_preds = tf.dynamic_partition(preds_flat, valid_indices, num_partitions=2)[1]
    confusion_matrix = tf.confusion_matrix(valid_labels, valid_preds, num_classes=params['num_classes'])
    cross_entropy = tf.losses.sparse_softmax_cross_entropy(logits=valid_logits, labels=valid_labels)
    train_var_list = [v for v in tf.trainable_variables()]
    # cross-entropy loss + L2 regularization
    loss = cross_entropy + params.get('weight_decay', _WEIGHT_DECAY) * tf.add_n(
        [tf.nn.l2_loss(v) for v in train_var_list])
    ### global step and polynomial learning-rate decay
    global_step = tf.train.get_or_create_global_step()
    learning_rate = tf.train.polynomial_decay(
        params['initial_learning_rate'],
        tf.cast(global_step, tf.int32) - params['initial_global_step'],
        params['max_iter'],
        params['end_learning_rate'],
        power=params['power'])
    tf.identity(learning_rate, name='learning_rate')
    optimizer = tf.train.MomentumOptimizer(learning_rate=learning_rate, momentum=params['momentum'])
    # make sure the batch-norm moving averages are updated at each train step
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(update_ops):
        train_op = optimizer.minimize(loss, global_step, var_list=train_var_list)
    return tf.estimator.EstimatorSpec(
        mode=mode,
        predictions=predictions,
        loss=loss,
        train_op=train_op,
        eval_metric_ops=metrics)
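A minimal sketch of wiring the model_fn into tf.estimator (the hyperparameter values and train_input_fn are placeholder assumptions of my own):

model = tf.estimator.Estimator(
    model_fn=deeplabv3_plus_model_fn,
    model_dir='./model',              # where checkpoints are written (example path)
    params={
        'num_classes': 21,            # e.g. PASCAL VOC; adjust to your dataset
        'weight_decay': 2e-4,
        'initial_learning_rate': 7e-3,
        'end_learning_rate': 1e-6,
        'initial_global_step': 0,
        'max_iter': 30000,
        'power': 0.9,
        'momentum': 0.9,
    })
# train_input_fn should yield (features, labels) batches parsed from the TFRecords
model.train(input_fn=train_input_fn)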
4. Testing
def predict(img):
    # img = plt.imread(image_files[0])
    image = tf.convert_to_tensor(img)
    image = tf.to_float(tf.image.convert_image_dtype(image, dtype=tf.uint8))
    image.set_shape([None, None, 3])
    images = preprocessing.mean_image_subtraction(image)
    images = tf.reshape(images, [1, tf.shape(image)[0], tf.shape(image)[1], 3])
    # dummy labels; only the predictions are used in EVAL mode here
    labels = tf.zeros([1, tf.shape(image)[0], tf.shape(image)[1], 1])
    labels = tf.to_int32(tf.image.convert_image_dtype(labels, dtype=tf.uint8))
    predictions = deeplab_model.deeplabv3_plus_model_fn(
        images,
        labels,
        tf.estimator.ModeKeys.EVAL,
        params={
            'output_stride': FLAGS.output_stride,
            'batch_size': 1,  # batch size must be 1 because image sizes may differ
            'base_architecture': FLAGS.base_architecture,
            'pre_trained_model': None,
            'batch_norm_decay': None,
            'num_classes': _NUM_CLASSES,
            'freeze_batch_norm': True
        }).predictions
    saver = tf.train.Saver()
    with tf.Session(config=config) as sess:
        # restore the latest checkpoint from the model directory
        ckpt = tf.train.get_checkpoint_state(FLAGS.model_dir)
        saver.restore(sess, ckpt.model_checkpoint_path)
        preds = sess.run(predictions)
    pred = preds['classes']
    pred = pred.astype(np.float32)
    # swap classes 0 and 1 (binary segmentation): 1 -> 0, 0 -> 1
    pred[pred == 1] = -1
    pred += 1
    return pred[0, :, :, 0].astype(np.uint8)
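Example usage of the helper above (my own sketch; file paths are illustrative, and the binary mask is scaled for viewing):

import numpy as np
import matplotlib.pyplot as plt
from PIL import Image

img = plt.imread('test.jpg')              # HxWx3 image
mask = predict(img)                       # HxW uint8 mask, classes 0/1 swapped
Image.fromarray(mask * 255).save('mask.png')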
Evaluation metric
IoU (intersection over union): the overlap between the region predicted by the model and the ground-truth region, i.e., the intersection of the prediction and the ground truth divided by their union.
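A small helper (my own sketch) that turns the confusion matrix accumulated during training into per-class and mean IoU:

import numpy as np

def mean_iou(cm):
    # cm[i, j] counts pixels with ground truth i predicted as class j
    intersection = np.diag(cm).astype(np.float64)
    union = cm.sum(axis=0) + cm.sum(axis=1) - intersection
    iou = intersection / np.maximum(union, 1)   # guard against empty classes
    return iou, iou.mean()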
Training results
(training curves omitted)
Issues and notes
- The training images are large (most exceed the crop size). If crop_size is set too small, most crops contain only sky or only foreground; a larger crop_size works better. Also, training without padding works better.
References
[1] L.-C. Chen, Y. Zhu, G. Papandreou, F. Schroff, and H. Adam. Encoder-Decoder with Atrous Separable Convolution for Semantic Image Segmentation. ECCV 2018.
Appendix
ResNet-101 structure
layer    blocks                                    output size
conv1    7x7, 64, stride 2                         112x112
conv2_x  [1x1, 64 | 3x3, 64 | 1x1, 256] x3         56x56
conv3_x  [1x1, 128 | 3x3, 128 | 1x1, 512] x4       28x28
conv4_x  [1x1, 256 | 3x3, 256 | 1x1, 1024] x23     14x14
conv5_x  [1x1, 512 | 3x3, 512 | 1x1, 2048] x3      7x7
         average pool, 1000-d fc, softmax