3 Computer Vision - Reading Notes (9)

Author: 深度学习模型优化 | Published 2019-04-30

    3.7.2 FaceNet Code

    Code address:

    The code is organized into three steps:

    • Face detection: detect the face, then transform, crop, and align it.
    • Use a CNN to extract a 128-dimensional feature representation.
    • Compare the features against those in the face database and output the result (as sketched right after this list).
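
    The third step, database matching, is never shown explicitly in the walkthrough below. Here is a minimal sketch, assuming a face database that maps each name to a 128-dimensional embedding; both the dict layout and the 0.6 threshold are illustrative, not from the text:

    import numpy as np

    def match(emb, face_db, threshold=0.6):
        """Return the best-matching identity in face_db, or None if no
        entry is within the squared-L2 distance threshold."""
        if not face_db:
            return None
        names = list(face_db)
        dists = [np.sum(np.square(emb - face_db[n])) for n in names]
        best = int(np.argmin(dists))
        return names[best] if dists[best] < threshold else None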

    Code Walkthrough

    Download the Dlib face landmark model file

    import bz2
    import os
    
    from urllib.request import urlopen
    
    def download_landmarks(dst_file):
        url = 'http://dlib.net/files/shape_predictor_68_face_landmarks.dat.bz2'
        decompressor = bz2.BZ2Decompressor()
        
        # Stream the download and decompress it on the fly, 1 KB at a time
        with urlopen(url) as src, open(dst_file, 'wb') as dst:
            data = src.read(1024)
            while len(data) > 0:
                dst.write(decompressor.decompress(data))
                data = src.read(1024)
    
    dst_dir = 'models'
    dst_file = os.path.join(dst_dir, 'landmarks.dat')
    
    if not os.path.exists(dst_file):
        os.makedirs(dst_dir, exist_ok=True)  # don't fail if 'models' already exists
        download_landmarks(dst_file)
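
    Once downloaded, the file can be loaded through dlib's standard API (the same two calls appear again in the AlignDlib constructor later in this section):

    import dlib
    
    detector = dlib.get_frontal_face_detector()               # HOG-based frontal face detector
    predictor = dlib.shape_predictor('models/landmarks.dat')  # 68-point landmark model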
    

    CNN Architecture and Training

    from keras import backend as K
    from keras.models import Model
    from keras.layers import (Input, ZeroPadding2D, Conv2D, BatchNormalization,
                              Activation, AveragePooling2D, Flatten, Dense, Lambda)
    
    def create_model():
        myInput = Input(shape=(96, 96, 3))
        
        x = ZeroPadding2D(padding=(3, 3), input_shape=(96, 96, 3))(myInput)
        x = Conv2D(64, (7, 7), strides=(2, 2), name='conv1')(x)
        x = BatchNormalization(axis=3, epsilon=0.00001, name='bn1')(x)
        x = Activation('relu')(x)
        x = ZeroPadding2D(padding=(1, 1))(x)
        ...  # the Inception blocks in between are omitted here
        av_pool = AveragePooling2D(pool_size=(3, 3), strides=(1, 1))(inception_5b)
        reshape_layer = Flatten()(av_pool)
        dense_layer = Dense(128, name='dense_layer')(reshape_layer)
        # L2-normalize so every embedding lies on the unit hypersphere
        norm_layer = Lambda(lambda x: K.l2_normalize(x, axis=1), name='norm_layer')(dense_layer)
        
        return Model(inputs=[myInput], outputs=norm_layer)
    

    The code above creates a model whose input is a 96 \times 96 \times 3 face image and whose output is a 128-dimensional vector. Because of the final L2 normalization, every embedding has unit length, so the squared Euclidean distance between any two embeddings lies in [0, 4].
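
    The training snippets below refer to this network as nn4_small2; presumably it is instantiated from create_model (the variable name is taken from those snippets, the instantiation itself is an assumption):

    # Instantiate the embedding network used by the triplet-training code below
    nn4_small2 = create_model()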

    Before writing the loss function code, let's first review the triplet loss:
    L = \sum_{i=1}^m \left[ \Vert f(x_i^a) - f(x_i^p) \Vert_2^2 - \Vert f(x_i^a) - f(x_i^n) \Vert_2^2 + \alpha \right]_+

    For example, with margin \alpha = 0.2, a triplet with anchor-positive squared distance 0.5 and anchor-negative squared distance 1.0 contributes \max(0.5 - 1.0 + 0.2,\, 0) = 0 to the loss, while one with anchor-negative distance 0.6 contributes 0.1.

    from keras import backend as K
    from keras.models import Model
    from keras.layers import Input, Layer
    
    # Input anchor, positive, and negative images
    in_a = Input(shape=(96, 96, 3))
    in_p = Input(shape=(96, 96, 3))
    in_n = Input(shape=(96, 96, 3))
    
    # Embedding outputs for the three inputs
    emb_a = nn4_small2(in_a)
    emb_p = nn4_small2(in_p)
    emb_n = nn4_small2(in_n)
    
    class TripletLossLayer(Layer):
        def __init__(self, alpha, **kwargs):
            self.alpha = alpha
            super(TripletLossLayer, self).__init__(**kwargs)
        
        def triplet_loss(self, inputs):
            a, p, n = inputs
            p_dist = K.sum(K.square(a-p), axis=-1)
            n_dist = K.sum(K.square(a-n), axis=-1)
            return K.sum(K.maximum(p_dist - n_dist + self.alpha, 0), axis=0)
        
        def call(self, inputs):
            loss = self.triplet_loss(inputs)
            # Register the loss via add_loss; this is why the model is
            # later compiled with loss=None
            self.add_loss(loss)
            return loss
    
    # Call the TripletLossLayer defined above on the three embeddings
    triplet_loss_layer = TripletLossLayer(alpha=0.2, name='triplet_loss_layer')([emb_a, emb_p, emb_n])
    
    # Model definition
    nn4_small2_train = Model([in_a, in_p, in_n], triplet_loss_layer)
    

    With the loss defined, we come to the hardest and most important part: triplet selection. Note that since the loss is attached inside the layer via add_loss, the model is compiled with loss=None, and the generator yields None as the target.

    from data import triplet_generator
    
    # triplet_generator() yields batches of (anchor, positive, negative)
    # triplets with a None target, matching the add_loss setup above
    generator = triplet_generator()
    
    nn4_small2_train.compile(loss=None, optimizer='adam')
    nn4_small2_train.fit_generator(generator, epochs=10, steps_per_epoch=100)
    
    import numpy as np
    
    def triplet_generator():
        ''' Dummy triplet generator for API usage demo only.
        
        Will be replaced by a version that uses real image data later.
        
        :return: a batch of (anchor, positive, negative) triplets
        '''
        while True:
            a_batch = np.random.rand(4, 96, 96, 3)
            p_batch = np.random.rand(4, 96, 96, 3)
            n_batch = np.random.rand(4, 96, 96, 3)
            yield [a_batch, p_batch, n_batch], None
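
    The dummy generator above deliberately sidesteps the real selection problem. The FaceNet paper's answer is online mining of semi-hard negatives: negatives that are farther from the anchor than the positive, yet still inside the margin. A minimal NumPy sketch of that selection rule on precomputed embeddings (the function and argument names are illustrative, not from the notebook):

    import numpy as np

    def select_semi_hard(anchor, positive, negatives, alpha=0.2):
        """Pick a semi-hard negative for one (anchor, positive) pair:
        farther from the anchor than the positive, but still inside the
        margin, i.e. d(a,p) < d(a,n) < d(a,p) + alpha. Falls back to the
        hardest negative overall if no semi-hard candidate exists."""
        d_ap = np.sum(np.square(anchor - positive))
        d_an = np.sum(np.square(anchor - negatives), axis=1)
        mask = (d_an > d_ap) & (d_an < d_ap + alpha)
        if mask.any():
            idx = np.argmin(np.where(mask, d_an, np.inf))  # hardest semi-hard
        else:
            idx = np.argmin(d_an)  # fallback: hardest negative overall
        return negatives[idx]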
    

    Defining the training code

    The snippet below is a generic Keras compile/fit template (a small dense classifier on placeholder data), illustrating the general training workflow rather than the FaceNet training itself:

    from keras.models import Sequential
    from keras.layers import Dense
    
    model = Sequential()
    model.add(Dense(units=64, activation='relu', input_dim=100))
    model.add(Dense(units=10, activation='softmax'))
    
    model.compile(loss='categorical_crossentropy',
        optimizer='sgd',
        metrics=['accuracy'])
    # x_train and y_train are placeholders for your own data
    model.fit(x_train, y_train, epochs=5, batch_size=32)
    

    Face Detection and Alignment


    Figure 1: Landmark detection result. Figure 2: Face alignment.

    Here we use Dlib for face detection and OpenCV for face alignment.

    import cv2
    import matplotlib.pyplot as plt
    import matplotlib.patches as patches
    
    from align import AlignDlib
    
    %matplotlib inline
    
    def load_image(path):
        img = cv2.imread(path, 1)
        # OpenCV loads images in BGR order; reverse the channels to get RGB
        return img[...,::-1]
    
    # Construct the AlignDlib object from the landmarks.dat file downloaded earlier
    alignment = AlignDlib('models/landmarks.dat')
    
    # Load a photo of Jacques Chirac (metadata is the image index built earlier in the notebook)
    jc_orig = load_image(metadata[2].image_path())
    
    # Run face detection; returns the bounding box
    bb = alignment.getLargestFaceBoundingBox(jc_orig)
    
    # Align via landmarks and an affine transform, then crop to 96x96
    jc_aligned = alignment.align(96, jc_orig, bb, landmarkIndices=AlignDlib.OUTER_EYES_AND_NOSE)
    
    # Show the original image
    plt.subplot(131)
    plt.imshow(jc_orig)
    
    # Show the bounding box
    plt.subplot(132)
    plt.imshow(jc_orig)
    plt.gca().add_patch(patches.Rectangle((bb.left(), bb.top()), bb.width(), bb.height(), 
                fill=False, color='red'))
    
    # Show the aligned image
    plt.subplot(133)
    plt.imshow(jc_aligned);
    

    The core functions used in the code above

    def getAllFaceBoundingBoxes(self, rgbImg):
        """
        Find the bounding boxes of all faces in the input image.
        :param rgbImg: RGB image. Shape: (height, width, 3)
        :type rgbImg: numpy.ndarray
        :return: bounding boxes of all detected faces
        :rtype: dlib.rectangles
        """
        assert rgbImg is not None
        
        try:
            return self.detector(rgbImg, 1)
        except Exception as e:
            print("Warning: {}".format(e))
            # In rare cases, exceptions are thrown.
            return []
    
    def getLargestFaceBoundingBox(self, rgbImg, skipMulti=False):
        assert rgbImg is not None
        
        faces = self.getAllFaceBoundingBoxes(rgbImg)
        # With skipMulti=True, images containing more than one face yield None
        if (not skipMulti and len(faces) > 0) or len(faces) == 1:
            return max(faces, key=lambda rect: rect.width() * rect.height())
        else:
            return None
    
    def __init__(self, facePredictor):
        # Dlib's frontal face detector plus the 68-point landmark
        # predictor loaded from the file downloaded earlier
        self.detector = dlib.get_frontal_face_detector()
        self.predictor = dlib.shape_predictor(facePredictor)
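
    The align method in the next snippet calls self.findLandmarks, which is not reproduced in this excerpt. A minimal sketch based on dlib's shape_predictor API (OpenFace's AlignDlib implements it along these lines):

    def findLandmarks(self, rgbImg, bb):
        # Run the 68-point predictor inside the bounding box and return
        # the landmarks as a list of (x, y) tuples
        points = self.predictor(rgbImg, bb)
        return [(p.x, p.y) for p in points.parts()]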
    

    Face alignment

    def align(self, imgDim, rgbImg, bb=None,
              landmarks=None, landmarkIndices=INNER_EYES_AND_BOTTOM_LIP,
              skipMulti=False):
        
        # Detect the face if no bounding box was supplied
        if bb is None:
            bb = self.getLargestFaceBoundingBox(rgbImg, skipMulti)
        if bb is None:
            return
        
        # Locate the 68 facial landmarks if they were not supplied
        if landmarks is None:
            landmarks = self.findLandmarks(rgbImg, bb)
        
        npLandmarks = np.float32(landmarks)
        npLandmarkIndices = np.array(landmarkIndices)
        
        # Estimate the affine transform that maps the chosen landmarks onto
        # their template positions (MINMAX_TEMPLATE holds canonical landmark
        # coordinates normalized to [0, 1], so multiplying by imgDim gives
        # output pixel coordinates), then warp and crop to imgDim x imgDim
        H = cv2.getAffineTransform(npLandmarks[npLandmarkIndices],
                                   imgDim * MINMAX_TEMPLATE[npLandmarkIndices])
        thumbnail = cv2.warpAffine(rgbImg, H, (imgDim, imgDim))
        
        return thumbnail
    

    Generating the Embedding Vectors

    embedded = np.zeros((metadata.shape[0], 128))
    
    # nn4_small2_pretrained is the embedding CNN with pretrained weights loaded
    for i, m in enumerate(metadata):
        img = load_image(m.image_path())
        img = align_image(img)
        # Scale RGB values from [0, 255] down to [0, 1]
        img = (img / 255.).astype(np.float32)
        # Keras expects a leading batch dimension, so expand_dims turns
        # img from (96, 96, 3) into (1, 96, 96, 3)
        embedded[i] = nn4_small2_pretrained.predict(np.expand_dims(img, axis=0))[0]
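
    The align_image helper used in this loop is not shown in the excerpt; a minimal sketch consistent with the alignment demo earlier (it reuses the same alignment object; the OUTER_EYES_AND_NOSE choice matches that demo but is an assumption here):

    def align_image(img):
        # Detect the largest face and align/crop it to 96x96
        return alignment.align(96, img,
                               alignment.getLargestFaceBoundingBox(img),
                               landmarkIndices=AlignDlib.OUTER_EYES_AND_NOSE)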
    
    Figure 3: Distance computation example

    Optimal distance threshold. To find the best distance threshold we need a labeled dataset: try a range of thresholds, measure how well each one separates same-identity pairs from different-identity pairs, and keep the best. Many metrics could score a threshold; here we use the F1 score. (This amounts to a cross-validation-style grid search; for an automated method that avoids the manual sweep, see the sketch after the code below.)

    from sklearn.metrics import f1_score, accuracy_score
    
    distances = [] # squared L2 distance between pairs
    identical = [] # 1 if same identity, 0 otherwise
    
    num = len(metadata)
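    
    # The distance() helper is not shown in the excerpt; judging from the
    # comment above ("squared L2 distance"), presumably it is:
    def distance(emb1, emb2):
        return np.sum(np.square(emb1 - emb2))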
    
    # Enumerate all unordered pairs (i, j) with i < j
    for i in range(num - 1):
        for j in range(i + 1, num):
            distances.append(distance(embedded[i], embedded[j]))
            identical.append(1 if metadata[i].name == metadata[j].name else 0)
    
    distances = np.array(distances)
    identical = np.array(identical)
    
    thresholds = np.arange(0.3, 1.0, 0.01)
    
    f1_scores = [f1_score(identical, distances < t) for t in thresholds]
    acc_scores = [accuracy_score(identical, distances < t) for t in thresholds]
    
    opt_idx = np.argmax(f1_scores) 
    opt_tau = thresholds[opt_idx] 
    opt_acc = accuracy_score(identical, distances < opt_tau)
     
    plt.plot(thresholds, f1_scores, label='F1 score');
    plt.plot(thresholds, acc_scores, label='Accuracy');
    plt.axvline(x=opt_tau, linestyle='--', lw=1, c='lightgrey', label='Threshold')
    plt.title(f'Accuracy at threshold {opt_tau:.2f} = {opt_acc:.3f}');
    plt.xlabel('Distance threshold')
    plt.legend();
    
    Figure 4: Finding the optimal threshold
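
    On the parenthetical question above: one way to avoid a hand-picked grid is to derive candidate thresholds directly from the data with scikit-learn's precision_recall_curve and take the one that maximizes F1. A hedged sketch, not from the original notebook:

    from sklearn.metrics import precision_recall_curve
    
    # precision_recall_curve expects higher scores for the positive class,
    # so negate the distances (small distance = likely same identity)
    precision, recall, pr_thresholds = precision_recall_curve(identical, -distances)
    
    # F1 for every candidate threshold (precision/recall have one extra entry)
    f1 = 2 * precision[:-1] * recall[:-1] / (precision[:-1] + recall[:-1] + 1e-12)
    
    # Undo the negation to recover a distance threshold
    opt_tau_auto = -pr_thresholds[np.argmax(f1)]
    print(f'Automatically selected distance threshold: {opt_tau_auto:.2f}')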

    Classification. With the embeddings in hand, recognizing identities reduces to training an ordinary classifier on the embedding vectors; below we compare KNN and a linear SVM.

    from sklearn.preprocessing import LabelEncoder
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.svm import LinearSVC
    
    targets = np.array([m.name for m in metadata])
    
    encoder = LabelEncoder()
    encoder.fit(targets)
    
    y = encoder.transform(targets)
    
    train_idx = np.arange(metadata.shape[0]) % 2 != 0
    test_idx = np.arange(metadata.shape[0]) % 2 == 0
    
    # 5 images per person for training, 10 * 5 = 50 training samples in total
    X_train = embedded[train_idx]
    # 5 images per person for testing, 10 * 5 = 50 test samples in total
    X_test = embedded[test_idx]
    
    y_train = y[train_idx]
    y_test = y[test_idx]
    
    knn = KNeighborsClassifier(n_neighbors=1, metric='euclidean')
    svc = LinearSVC()
    
    knn.fit(X_train, y_train)
    svc.fit(X_train, y_train)
    
    acc_knn = accuracy_score(y_test, knn.predict(X_test))
    acc_svc = accuracy_score(y_test, svc.predict(X_test))
    
    print(f'KNN accuracy = {acc_knn}, SVM accuracy = {acc_svc}')
    
    Figure 5: SVM classification prediction
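
    To classify a single new image, the same pipeline applies; a small usage sketch (example_idx is an arbitrary test index introduced here for illustration):

    # Predict the identity of one test embedding and decode the label
    example_idx = 0  # illustrative index into the test set
    example_pred = svc.predict([X_test[example_idx]])
    example_name = encoder.inverse_transform(example_pred)[0]
    print(f'Predicted identity: {example_name}')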

    Data visualization. To show that the face embedding really does map photos of the same person to nearby points in embedding space, we reduce the 128-dimensional embeddings to 2D with t-SNE.

    from sklearn.manifold import TSNE
    
    # Reduce the embeddings to 2D with t-SNE
    X_embedded = TSNE(n_components=2).fit_transform(embedded)
    
    # Iterate over the 10 classes (people)
    for i, t in enumerate(set(targets)):
        # Find the indices of this person's photos
        idx = targets == t
        # Draw them as one scatter group
        plt.scatter(X_embedded[idx, 0], X_embedded[idx, 1], label=t)
    
    plt.legend(bbox_to_anchor=(1, 1));
    
    Figure 6: Visualization of the face embeddings
