3.7.2 FaceNet Code
Code address:
The code consists of three steps:
- Face detection, including transforming, cropping, and aligning the face.
- Using a CNN to extract a 128-dimensional feature representation (embedding).
- Comparing the embedding against those in a face database and outputting the result (as sketched below).
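The comparison in the last step reduces to thresholding a distance between two embeddings. A minimal sketch, assuming a squared L2 distance; the threshold tau is tuned properly later in this section:

import numpy as np

def is_same_person(emb1, emb2, tau):
    # Two faces are declared a match when the squared L2 distance
    # between their embeddings is below the tuned threshold tau
    return np.sum(np.square(emb1 - emb2)) < tau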
Code walkthrough
Downloading Dlib's face landmark model file
import bz2
import os
from urllib.request import urlopen

def download_landmarks(dst_file):
    url = 'http://dlib.net/files/shape_predictor_68_face_landmarks.dat.bz2'
    decompressor = bz2.BZ2Decompressor()
    with urlopen(url) as src, open(dst_file, 'wb') as dst:
        data = src.read(1024)
        while len(data) > 0:
            dst.write(decompressor.decompress(data))
            data = src.read(1024)

dst_dir = 'models'
dst_file = os.path.join(dst_dir, 'landmarks.dat')

if not os.path.exists(dst_file):
    os.makedirs(dst_dir)
    download_landmarks(dst_file)
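Once downloaded, the file can be sanity-checked by loading it with dlib. A minimal check, assuming dlib is installed:

import dlib

# Loading the 68-point landmark predictor raises an error
# if the file is missing or corrupt
predictor = dlib.shape_predictor('models/landmarks.dat')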
CNN structure and training
def create_model():
    myInput = Input(shape=(96, 96, 3))
    x = ZeroPadding2D(padding=(3, 3), input_shape=(96, 96, 3))(myInput)
    x = Conv2D(64, (7, 7), strides=(2, 2), name='conv1')(x)
    x = BatchNormalization(axis=3, epsilon=0.00001, name='bn1')(x)
    x = Activation('relu')(x)
    x = ZeroPadding2D(padding=(1, 1))(x)
    ...
    av_pool = AveragePooling2D(pool_size=(3, 3), strides=(1, 1))(inception_5b)
    reshape_layer = Flatten()(av_pool)
    dense_layer = Dense(128, name='dense_layer')(reshape_layer)
    norm_layer = Lambda(lambda x: K.l2_normalize(x, axis=1), name='norm_layer')(dense_layer)
    return Model(inputs=[myInput], outputs=norm_layer)
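As a quick check, we can instantiate the model and verify that it emits L2-normalized 128-dimensional embeddings. The sketch below assumes the elided Inception layers are filled in, and binds the model to the name nn4_small2, which the triplet-loss code below uses:

import numpy as np

nn4_small2 = create_model()
print(nn4_small2.output_shape)  # (None, 128)

# The final Lambda layer L2-normalizes each embedding, so its norm is ~1
emb = nn4_small2.predict(np.random.rand(1, 96, 96, 3).astype(np.float32))
print(np.linalg.norm(emb))  # ~1.0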
The code above creates a model whose input is a 96x96 face image and whose output is a 128-dimensional vector.
Before writing the loss function code, let's first review the triplet loss.
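For an anchor $a$, a positive $p$ (same identity), and a negative $n$ (different identity), the loss pushes the anchor-positive distance below the anchor-negative distance by at least a margin $\alpha$:

$$L = \sum_i \max\left(\lVert f(a_i) - f(p_i) \rVert_2^2 - \lVert f(a_i) - f(n_i) \rVert_2^2 + \alpha,\ 0\right)$$

The TripletLossLayer defined below implements exactly this expression.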
from keras import backend as K
from keras.models import Model
from keras.layers import Input, Layer

# Input anchor, positive, and negative images
in_a = Input(shape=(96, 96, 3))
in_p = Input(shape=(96, 96, 3))
in_n = Input(shape=(96, 96, 3))

# Output embeddings (nn4_small2 is the embedding model instantiated above)
emb_a = nn4_small2(in_a)
emb_p = nn4_small2(in_p)
emb_n = nn4_small2(in_n)
class TripletLossLayer(Layer):
    def __init__(self, alpha, **kwargs):
        self.alpha = alpha
        super(TripletLossLayer, self).__init__(**kwargs)

    def triplet_loss(self, inputs):
        a, p, n = inputs
        # Squared L2 distances: anchor-positive and anchor-negative
        p_dist = K.sum(K.square(a - p), axis=-1)
        n_dist = K.sum(K.square(a - n), axis=-1)
        return K.sum(K.maximum(p_dist - n_dist + self.alpha, 0), axis=0)

    def call(self, inputs):
        loss = self.triplet_loss(inputs)
        self.add_loss(loss)
        return loss

# Use the TripletLossLayer defined above
triplet_loss_layer = TripletLossLayer(alpha=0.2, name='triplet_loss_layer')([emb_a, emb_p, emb_n])

# Model definition
nn4_small2_train = Model([in_a, in_p, in_n], triplet_loss_layer)
With the loss defined, the next step is triplet selection, the hardest and most important part of training.
from data import triplet_generator

# triplet_generator yields batches of (anchor, positive, negative) triplets
generator = triplet_generator()

# The loss is added inside TripletLossLayer via add_loss, so compile takes loss=None
nn4_small2_train.compile(loss=None, optimizer='adam')
nn4_small2_train.fit_generator(generator, epochs=10, steps_per_epoch=100)
import numpy as np

def triplet_generator():
    ''' Dummy triplet generator for API usage demo only.
    Will be replaced by a version that uses real image data later.
    :return: a batch of (anchor, positive, negative) triplets
    '''
    while True:
        a_batch = np.random.rand(4, 96, 96, 3)
        p_batch = np.random.rand(4, 96, 96, 3)
        n_batch = np.random.rand(4, 96, 96, 3)
        # The loss is computed inside the model, so the target (y) is None
        yield [a_batch, p_batch, n_batch], None
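A real generator would implement the online semi-hard negative mining described in the FaceNet paper: for each anchor-positive pair, choose a negative that is farther from the anchor than the positive but still inside the margin. A NumPy sketch of that selection rule (select_semi_hard_negative is a hypothetical helper, not part of the repository):

import numpy as np

def select_semi_hard_negative(emb_a, emb_p, embeddings, labels, anchor_label, alpha=0.2):
    # Distance from the anchor to its positive, and to every candidate
    d_ap = np.sum(np.square(emb_a - emb_p))
    d_an = np.sum(np.square(embeddings - emb_a), axis=1)
    # Semi-hard: different identity, farther than the positive, within the margin
    mask = (labels != anchor_label) & (d_an > d_ap) & (d_an < d_ap + alpha)
    candidates = np.where(mask)[0]
    if len(candidates) == 0:
        # Fall back to the hardest negative when no semi-hard one exists
        negatives = np.where(labels != anchor_label)[0]
        return negatives[np.argmin(d_an[negatives])]
    return np.random.choice(candidates)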
Defining the training code. For reference, the standard Keras workflow of building, compiling, and fitting a model looks like this:
model = Sequential()
model.add(Dense(units=64, activation='relu', input_dim=100))
model.add(Dense(units=10, activation='softmax'))
model.compile(loss='categorical_crossentropy',
              optimizer='sgd',
              metrics=['accuracy'])
model.fit(x_train, y_train, epochs=5, batch_size=32)
Face detection and alignment
Figure 1: Landmark detection result. Figure 2: Face alignment.
Here we use Dlib for face detection and OpenCV for face alignment.
import cv2
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from align import AlignDlib

%matplotlib inline

def load_image(path):
    img = cv2.imread(path, 1)
    # OpenCV loads images in BGR order; reverse the channels to get RGB
    return img[..., ::-1]

# Construct the AlignDlib object, passing in the landmarks.dat file downloaded earlier
alignment = AlignDlib('models/landmarks.dat')

# Load a photo of Jacques Chirac
jc_orig = load_image(metadata[2].image_path())

# Detect the face and return its bounding box
bb = alignment.getLargestFaceBoundingBox(jc_orig)

# Align via an affine transformation, then crop to 96x96
jc_aligned = alignment.align(96, jc_orig, bb, landmarkIndices=AlignDlib.OUTER_EYES_AND_NOSE)

# Show the original image
plt.subplot(131)
plt.imshow(jc_orig)

# Show the bounding box
plt.subplot(132)
plt.imshow(jc_orig)
plt.gca().add_patch(patches.Rectangle((bb.left(), bb.top()), bb.width(), bb.height(),
                                      fill=False, color='red'))

# Show the aligned image
plt.subplot(133)
plt.imshow(jc_aligned);
The core functions used in the code above:
def getAllFaceBoundingBoxes(self, rgbImg):
    """
    Find the bounding boxes of all faces in the input image.

    :param rgbImg: RGB image. Shape: (height, width, 3)
    :type rgbImg: numpy.ndarray
    :return: bounding boxes of all detected faces
    :rtype: dlib.rectangles
    """
    assert rgbImg is not None
    try:
        return self.detector(rgbImg, 1)
    except Exception as e:
        print("Warning: {}".format(e))
        # In rare cases, exceptions are thrown.
        return []

def getLargestFaceBoundingBox(self, rgbImg, skipMulti=False):
    assert rgbImg is not None
    faces = self.getAllFaceBoundingBoxes(rgbImg)
    if (not skipMulti and len(faces) > 0) or len(faces) == 1:
        # Return the face with the largest area
        return max(faces, key=lambda rect: rect.width() * rect.height())
    else:
        return None

def __init__(self, facePredictor):
    # facePredictor is the path to the landmark model file (landmarks.dat)
    self.detector = dlib.get_frontal_face_detector()
    self.predictor = dlib.shape_predictor(facePredictor)
Face alignment:
def align(self, imgDim, rgbImg, bb=None,
          landmarks=None, landmarkIndices=INNER_EYES_AND_BOTTOM_LIP,
          skipMulti=False):
    if bb is None:
        bb = self.getLargestFaceBoundingBox(rgbImg, skipMulti)
        if bb is None:
            return

    if landmarks is None:
        landmarks = self.findLandmarks(rgbImg, bb)

    npLandmarks = np.float32(landmarks)
    npLandmarkIndices = np.array(landmarkIndices)

    # Compute the affine transform that maps the selected landmarks onto
    # their template positions, then warp and crop to imgDim x imgDim
    H = cv2.getAffineTransform(npLandmarks[npLandmarkIndices],
                               imgDim * MINMAX_TEMPLATE[npLandmarkIndices])
    thumbnail = cv2.warpAffine(rgbImg, H, (imgDim, imgDim))

    return thumbnail
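The embedding code below calls a helper, align_image, that is not shown in the excerpt above. A sketch consistent with the demo code, combining detection and alignment in one call:

def align_image(img):
    # Detect the largest face and align it to 96x96, using the outer eyes
    # and nose as reference landmarks (same settings as the demo above)
    return alignment.align(96, img, alignment.getLargestFaceBoundingBox(img),
                           landmarkIndices=AlignDlib.OUTER_EYES_AND_NOSE)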
Generating embedding vectors
embedded = np.zeros((metadata.shape[0], 128))

for i, m in enumerate(metadata):
    img = load_image(m.image_path())
    img = align_image(img)
    # Scale the RGB values from [0, 255] to the interval [0, 1]
    img = (img / 255.).astype(np.float32)
    # Keras expects the first dimension to be the batch, so expand_dims
    # turns img from (96, 96, 3) into (1, 96, 96, 3)
    embedded[i] = nn4_small2_pretrained.predict(np.expand_dims(img, axis=0))[0]
Figure 3: Example of distance computation
Optimal distance threshold. To find the best distance threshold, we need a dataset on which we try different thresholds, evaluate each one, and pick the best. Many metrics can be used to evaluate a threshold; here we use the F1 score. (The grid search below is manual; cross-validation or other automated methods could select the threshold without hand-tuning.)
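The snippet below relies on a small distance helper matching the comment in the code (squared L2 distance between two embeddings). A minimal definition:

def distance(emb1, emb2):
    # Squared L2 distance between two embedding vectors
    return np.sum(np.square(emb1 - emb2))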
from sklearn.metrics import f1_score, accuracy_score

distances = []  # squared L2 distance between pairs
identical = []  # 1 if same identity, 0 otherwise

num = len(metadata)

for i in range(num - 1):
    # Start at i + 1 so each pair is considered exactly once
    for j in range(i + 1, num):
        distances.append(distance(embedded[i], embedded[j]))
        identical.append(1 if metadata[i].name == metadata[j].name else 0)

distances = np.array(distances)
identical = np.array(identical)

thresholds = np.arange(0.3, 1.0, 0.01)

f1_scores = [f1_score(identical, distances < t) for t in thresholds]
acc_scores = [accuracy_score(identical, distances < t) for t in thresholds]

# Pick the threshold that maximizes the F1 score
opt_idx = np.argmax(f1_scores)
opt_tau = thresholds[opt_idx]
opt_acc = accuracy_score(identical, distances < opt_tau)

plt.plot(thresholds, f1_scores, label='F1 score');
plt.plot(thresholds, acc_scores, label='Accuracy');
plt.axvline(x=opt_tau, linestyle='--', lw=1, c='lightgrey', label='Threshold')
plt.title(f'Accuracy at threshold {opt_tau:.2f} = {opt_acc:.3f}');
plt.xlabel('Distance threshold')
plt.legend();
Figure 4: Finding the optimal threshold
Classification. Given the embeddings, we can train a simple classifier, such as KNN or a linear SVM, to recognize identities:
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC

targets = np.array([m.name for m in metadata])

encoder = LabelEncoder()
encoder.fit(targets)

y = encoder.transform(targets)

# Split by alternating indices: odd indices for training, even for testing
train_idx = np.arange(metadata.shape[0]) % 2 != 0
test_idx = np.arange(metadata.shape[0]) % 2 == 0

# 5 images per person for training, 10 * 5 = 50 training samples in total
X_train = embedded[train_idx]
# 5 images per person for testing, 10 * 5 = 50 test samples in total
X_test = embedded[test_idx]

y_train = y[train_idx]
y_test = y[test_idx]

knn = KNeighborsClassifier(n_neighbors=1, metric='euclidean')
svc = LinearSVC()

knn.fit(X_train, y_train)
svc.fit(X_train, y_train)

acc_knn = accuracy_score(y_test, knn.predict(X_test))
acc_svc = accuracy_score(y_test, svc.predict(X_test))

print(f'KNN accuracy = {acc_knn}, SVM accuracy = {acc_svc}')
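With the classifiers trained, identifying a new face is just a matter of embedding it and predicting its label. A usage sketch (the test index is arbitrary):

# Pick an arbitrary test sample and recover the predicted identity name
example_idx = 0
example_prediction = svc.predict([X_test[example_idx]])
example_identity = encoder.inverse_transform(example_prediction)[0]
print(f'Recognized as {example_identity}')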
Figure 5: SVM classification prediction
Data visualization. To show that the face embedding indeed maps photos of the same person to nearby points in the embedding space, we use t-SNE to reduce the embeddings to two dimensions.
from sklearn.manifold import TSNE

# Reduce the embeddings to 2 dimensions with t-SNE
X_embedded = TSNE(n_components=2).fit_transform(embedded)

# Iterate over the 10 identities
for i, t in enumerate(set(targets)):
    # Find the indices of this person's photos
    idx = targets == t
    # Draw the scatter plot
    plt.scatter(X_embedded[idx, 0], X_embedded[idx, 1], label=t)

plt.legend(bbox_to_anchor=(1, 1));
Figure 6: Visualization of the face embeddings