3.7.2 FaceNet Code
Code address:
The code consists of three steps:
- Face detection, including transforming, cropping, and aligning the face.
- Using a CNN to extract a 128-dimensional feature representation (embedding).
- Comparing the embedding against those in a face database and outputting the result (as sketched below).
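The comparison in the last step reduces to thresholding a distance between two embeddings. A minimal sketch, assuming a squared L2 distance; the threshold tau is tuned properly later in this section:

import numpy as np

def is_same_person(emb1, emb2, tau):
    # Two faces are declared a match when the squared L2 distance
    # between their embeddings is below the tuned threshold tau
    return np.sum(np.square(emb1 - emb2)) < tau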
Code walkthrough
Downloading Dlib's face landmark model file
import bz2
import os
from urllib.request import urlopen

def download_landmarks(dst_file):
    url = 'http://dlib.net/files/shape_predictor_68_face_landmarks.dat.bz2'
    decompressor = bz2.BZ2Decompressor()
    with urlopen(url) as src, open(dst_file, 'wb') as dst:
        data = src.read(1024)
        while len(data) > 0:
            dst.write(decompressor.decompress(data))
            data = src.read(1024)

dst_dir = 'models'
dst_file = os.path.join(dst_dir, 'landmarks.dat')

if not os.path.exists(dst_file):
    os.makedirs(dst_dir)
    download_landmarks(dst_file)
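Once downloaded, the file can be sanity-checked by loading it with dlib. A minimal check, assuming dlib is installed:

import dlib

# Loading the 68-point landmark predictor raises an error
# if the file is missing or corrupt
predictor = dlib.shape_predictor('models/landmarks.dat')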
CNN structure and training
def create_model():
    myInput = Input(shape=(96, 96, 3))
    x = ZeroPadding2D(padding=(3, 3), input_shape=(96, 96, 3))(myInput)
    x = Conv2D(64, (7, 7), strides=(2, 2), name='conv1')(x)
    x = BatchNormalization(axis=3, epsilon=0.00001, name='bn1')(x)
    x = Activation('relu')(x)
    x = ZeroPadding2D(padding=(1, 1))(x)
    ...
    av_pool = AveragePooling2D(pool_size=(3, 3), strides=(1, 1))(inception_5b)
    reshape_layer = Flatten()(av_pool)
    dense_layer = Dense(128, name='dense_layer')(reshape_layer)
    norm_layer = Lambda(lambda x: K.l2_normalize(x, axis=1), name='norm_layer')(dense_layer)
    return Model(inputs=[myInput], outputs=norm_layer)
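As a quick check, we can instantiate the model and verify that it emits L2-normalized 128-dimensional embeddings. The sketch below assumes the elided Inception layers are filled in, and binds the model to the name nn4_small2, which the triplet-loss code below uses:

import numpy as np

nn4_small2 = create_model()
print(nn4_small2.output_shape)  # (None, 128)

# The final Lambda layer L2-normalizes each embedding, so its norm is ~1
emb = nn4_small2.predict(np.random.rand(1, 96, 96, 3).astype(np.float32))
print(np.linalg.norm(emb))  # ~1.0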
The code above creates a model whose input is a 96x96 face image and whose output is a 128-dimensional vector.
Before writing the loss function code, let's first review the triplet loss.
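For an anchor $a$, a positive $p$ (same identity), and a negative $n$ (different identity), the loss pushes the anchor-positive distance below the anchor-negative distance by at least a margin $\alpha$:

$$L = \sum_i \max\left(\lVert f(a_i) - f(p_i) \rVert_2^2 - \lVert f(a_i) - f(n_i) \rVert_2^2 + \alpha,\ 0\right)$$

The TripletLossLayer defined below implements exactly this expression.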
from keras import backend as K
from keras.models import Model
from keras.layers import Input, Layer

# Input anchor, positive, and negative images
in_a = Input(shape=(96, 96, 3))
in_p = Input(shape=(96, 96, 3))
in_n = Input(shape=(96, 96, 3))

# Output embeddings (nn4_small2 is the embedding model instantiated above)
emb_a = nn4_small2(in_a)
emb_p = nn4_small2(in_p)
emb_n = nn4_small2(in_n)
class TripletLossLayer(Layer):
    def __init__(self, alpha, **kwargs):
        self.alpha = alpha
        super(TripletLossLayer, self).__init__(**kwargs)

    def triplet_loss(self, inputs):
        a, p, n = inputs
        # Squared L2 distances: anchor-positive and anchor-negative
        p_dist = K.sum(K.square(a - p), axis=-1)
        n_dist = K.sum(K.square(a - n), axis=-1)
        return K.sum(K.maximum(p_dist - n_dist + self.alpha, 0), axis=0)

    def call(self, inputs):
        loss = self.triplet_loss(inputs)
        self.add_loss(loss)
        return loss

# Use the TripletLossLayer defined above
triplet_loss_layer = TripletLossLayer(alpha=0.2, name='triplet_loss_layer')([emb_a, emb_p, emb_n])

# Model definition
nn4_small2_train = Model([in_a, in_p, in_n], triplet_loss_layer)
With the loss defined, the next step is triplet selection, the hardest and most important part of training.
from data import triplet_generator

# triplet_generator yields batches of (anchor, positive, negative) triplets
generator = triplet_generator()

# The loss is added inside TripletLossLayer via add_loss, so compile takes loss=None
nn4_small2_train.compile(loss=None, optimizer='adam')
nn4_small2_train.fit_generator(generator, epochs=10, steps_per_epoch=100)
import numpy as np

def triplet_generator():
    ''' Dummy triplet generator for API usage demo only.
    Will be replaced by a version that uses real image data later.
    :return: a batch of (anchor, positive, negative) triplets
    '''
    while True:
        a_batch = np.random.rand(4, 96, 96, 3)
        p_batch = np.random.rand(4, 96, 96, 3)
        n_batch = np.random.rand(4, 96, 96, 3)
        # The loss is computed inside the model, so the target (y) is None
        yield [a_batch, p_batch, n_batch], None
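A real generator would implement the online semi-hard negative mining described in the FaceNet paper: for each anchor-positive pair, choose a negative that is farther from the anchor than the positive but still inside the margin. A NumPy sketch of that selection rule (select_semi_hard_negative is a hypothetical helper, not part of the repository):

import numpy as np

def select_semi_hard_negative(emb_a, emb_p, embeddings, labels, anchor_label, alpha=0.2):
    # Distance from the anchor to its positive, and to every candidate
    d_ap = np.sum(np.square(emb_a - emb_p))
    d_an = np.sum(np.square(embeddings - emb_a), axis=1)
    # Semi-hard: different identity, farther than the positive, within the margin
    mask = (labels != anchor_label) & (d_an > d_ap) & (d_an < d_ap + alpha)
    candidates = np.where(mask)[0]
    if len(candidates) == 0:
        # Fall back to the hardest negative when no semi-hard one exists
        negatives = np.where(labels != anchor_label)[0]
        return negatives[np.argmin(d_an[negatives])]
    return np.random.choice(candidates)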
Defining the training code. For reference, the standard Keras workflow of building, compiling, and fitting a model looks like this:
model = Sequential()
model.add(Dense(units=64, activation='relu', input_dim=100))
model.add(Dense(units=10, activation='softmax'))
model.compile(loss='categorical_crossentropy',
              optimizer='sgd',
              metrics=['accuracy'])
model.fit(x_train, y_train, epochs=5, batch_size=32)
Face detection and alignment
Figure 1: Landmark detection result. Figure 2: Face alignment.
Here we use Dlib for face detection and OpenCV for face alignment.
import cv2
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from align import AlignDlib

%matplotlib inline

def load_image(path):
    img = cv2.imread(path, 1)
    # OpenCV loads images in BGR order; reverse the channels to get RGB
    return img[..., ::-1]

# Construct the AlignDlib object, passing in the landmarks.dat file downloaded earlier
alignment = AlignDlib('models/landmarks.dat')

# Load a photo of Jacques Chirac
jc_orig = load_image(metadata[2].image_path())

# Detect the face and return its bounding box
bb = alignment.getLargestFaceBoundingBox(jc_orig)

# Align via an affine transformation, then crop to 96x96
jc_aligned = alignment.align(96, jc_orig, bb, landmarkIndices=AlignDlib.OUTER_EYES_AND_NOSE)

# Show the original image
plt.subplot(131)
plt.imshow(jc_orig)

# Show the bounding box
plt.subplot(132)
plt.imshow(jc_orig)
plt.gca().add_patch(patches.Rectangle((bb.left(), bb.top()), bb.width(), bb.height(),
                                      fill=False, color='red'))

# Show the aligned image
plt.subplot(133)
plt.imshow(jc_aligned);
The core functions used in the code above:
def getAllFaceBoundingBoxes(self, rgbImg):
    """
    Find the bounding boxes of all faces in the input image.

    :param rgbImg: RGB image. Shape: (height, width, 3)
    :type rgbImg: numpy.ndarray
    :return: bounding boxes of all detected faces
    :rtype: dlib.rectangles
    """
    assert rgbImg is not None
    try:
        return self.detector(rgbImg, 1)
    except Exception as e:
        print("Warning: {}".format(e))
        # In rare cases, exceptions are thrown.
        return []

def getLargestFaceBoundingBox(self, rgbImg, skipMulti=False):
    assert rgbImg is not None
    faces = self.getAllFaceBoundingBoxes(rgbImg)
    if (not skipMulti and len(faces) > 0) or len(faces) == 1:
        # Return the face with the largest area
        return max(faces, key=lambda rect: rect.width() * rect.height())
    else:
        return None

def __init__(self, facePredictor):
    # facePredictor is the path to the landmark model file (landmarks.dat)
    self.detector = dlib.get_frontal_face_detector()
    self.predictor = dlib.shape_predictor(facePredictor)
Face alignment:
def align(self, imgDim, rgbImg, bb=None,
          landmarks=None, landmarkIndices=INNER_EYES_AND_BOTTOM_LIP,
          skipMulti=False):
    if bb is None:
        bb = self.getLargestFaceBoundingBox(rgbImg, skipMulti)
        if bb is None:
            return

    if landmarks is None:
        landmarks = self.findLandmarks(rgbImg, bb)

    npLandmarks = np.float32(landmarks)
    npLandmarkIndices = np.array(landmarkIndices)

    # Compute the affine transform that maps the selected landmarks onto
    # their template positions, then warp and crop to imgDim x imgDim
    H = cv2.getAffineTransform(npLandmarks[npLandmarkIndices],
                               imgDim * MINMAX_TEMPLATE[npLandmarkIndices])
    thumbnail = cv2.warpAffine(rgbImg, H, (imgDim, imgDim))

    return thumbnail
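The embedding code below calls a helper, align_image, that is not shown in the excerpt above. A sketch consistent with the demo code, combining detection and alignment in one call:

def align_image(img):
    # Detect the largest face and align it to 96x96, using the outer eyes
    # and nose as reference landmarks (same settings as the demo above)
    return alignment.align(96, img, alignment.getLargestFaceBoundingBox(img),
                           landmarkIndices=AlignDlib.OUTER_EYES_AND_NOSE)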
Generating embedding vectors
embedded = np.zeros((metadata.shape[0], 128))

for i, m in enumerate(metadata):
    img = load_image(m.image_path())
    img = align_image(img)
    # Scale the RGB values from [0, 255] to the interval [0, 1]
    img = (img / 255.).astype(np.float32)
    # Keras expects the first dimension to be the batch, so expand_dims
    # turns img from (96, 96, 3) into (1, 96, 96, 3)
    embedded[i] = nn4_small2_pretrained.predict(np.expand_dims(img, axis=0))[0]
Figure 3: Example of distance computation
Optimal distance threshold. To find the best distance threshold, we need a dataset on which we try different thresholds, evaluate each one, and pick the best. Many metrics can be used to evaluate a threshold; here we use the F1 score. (The grid search below is manual; cross-validation or other automated methods could select the threshold without hand-tuning.)
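The snippet below relies on a small distance helper matching the comment in the code (squared L2 distance between two embeddings). A minimal definition:

def distance(emb1, emb2):
    # Squared L2 distance between two embedding vectors
    return np.sum(np.square(emb1 - emb2))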
from sklearn.metrics import f1_score, accuracy_score

distances = []  # squared L2 distance between pairs
identical = []  # 1 if same identity, 0 otherwise

num = len(metadata)

for i in range(num - 1):
    # Start at i + 1 so each pair is considered exactly once
    for j in range(i + 1, num):
        distances.append(distance(embedded[i], embedded[j]))
        identical.append(1 if metadata[i].name == metadata[j].name else 0)

distances = np.array(distances)
identical = np.array(identical)

thresholds = np.arange(0.3, 1.0, 0.01)

f1_scores = [f1_score(identical, distances < t) for t in thresholds]
acc_scores = [accuracy_score(identical, distances < t) for t in thresholds]

# Pick the threshold that maximizes the F1 score
opt_idx = np.argmax(f1_scores)
opt_tau = thresholds[opt_idx]
opt_acc = accuracy_score(identical, distances < opt_tau)

plt.plot(thresholds, f1_scores, label='F1 score');
plt.plot(thresholds, acc_scores, label='Accuracy');
plt.axvline(x=opt_tau, linestyle='--', lw=1, c='lightgrey', label='Threshold')
plt.title(f'Accuracy at threshold {opt_tau:.2f} = {opt_acc:.3f}');
plt.xlabel('Distance threshold')
plt.legend();
Figure 4: Finding the optimal threshold
Classification. Given the embeddings, we can train a simple classifier, such as KNN or a linear SVM, to recognize identities:
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC

targets = np.array([m.name for m in metadata])

encoder = LabelEncoder()
encoder.fit(targets)

y = encoder.transform(targets)

# Split by alternating indices: odd indices for training, even for testing
train_idx = np.arange(metadata.shape[0]) % 2 != 0
test_idx = np.arange(metadata.shape[0]) % 2 == 0

# 5 images per person for training, 10 * 5 = 50 training samples in total
X_train = embedded[train_idx]
# 5 images per person for testing, 10 * 5 = 50 test samples in total
X_test = embedded[test_idx]

y_train = y[train_idx]
y_test = y[test_idx]

knn = KNeighborsClassifier(n_neighbors=1, metric='euclidean')
svc = LinearSVC()

knn.fit(X_train, y_train)
svc.fit(X_train, y_train)

acc_knn = accuracy_score(y_test, knn.predict(X_test))
acc_svc = accuracy_score(y_test, svc.predict(X_test))

print(f'KNN accuracy = {acc_knn}, SVM accuracy = {acc_svc}')
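With the classifiers trained, identifying a new face is just a matter of embedding it and predicting its label. A usage sketch (the test index is arbitrary):

# Pick an arbitrary test sample and recover the predicted identity name
example_idx = 0
example_prediction = svc.predict([X_test[example_idx]])
example_identity = encoder.inverse_transform(example_prediction)[0]
print(f'Recognized as {example_identity}')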
Figure 5: SVM classification prediction
Data visualization. To show that the face embedding indeed maps photos of the same person to nearby points in the embedding space, we use t-SNE to reduce the embeddings to two dimensions.
from sklearn.manifold import TSNE

# Reduce the embeddings to 2 dimensions with t-SNE
X_embedded = TSNE(n_components=2).fit_transform(embedded)

# Iterate over the 10 identities
for i, t in enumerate(set(targets)):
    # Find the indices of this person's photos
    idx = targets == t
    # Draw the scatter plot
    plt.scatter(X_embedded[idx, 0], X_embedded[idx, 1], label=t)

plt.legend(bbox_to_anchor=(1, 1));
Figure 6: Visualization of the face embeddings