MTCNN Face Detection: A PyTorch Implementation

    Author: 小黄不头秃 | Published 2023-06-17 18:36

    MTCNN (Multi-Task Cascaded Convolutional Neural Network) is a cascaded detector for a specific target class that performs very well on face detection. It is made up of three networks: P-Net, R-Net, and O-Net. Other algorithms commonly used for object detection include SSD (Single Shot MultiBox Detector), R-CNN, and YOLO.

    If you are not familiar with the basics of MTCNN, see my earlier note:
    MTCNN之基础知识笔记 - 简书 (jianshu.com)

    I. Network Structure

    MTCNN consists of three sub-networks: P-Net, R-Net, and O-Net.
    P-Net takes 12×12 inputs, R-Net takes 24×24, and O-Net takes 48×48. P-Net focuses on sifting useful candidates out of a large volume of data, R-Net re-screens P-Net's results, and O-Net focuses on the precision of the final detection.

    (1) P-Net (3 layers)

    The image pyramid and sliding window split one image into a great many windows, so P-Net faces the largest amount of data.

    P-Net effectively acts as a 12×12 convolution kernel that screens this flood of windows. It is also fully convolutional, which means its input does not have to be a fixed size.

    (2) R-Net (4 layers)

    R-Net uses three convolutional layers followed by fully connected layers, unlike P-Net. Why the difference, given that P-Net is fully convolutional? Because we do not know in advance how many faces an image contains, and images vary in size, P-Net cannot use fully connected layers. R-Net, however, receives crops derived from P-Net's output, so its input size is fixed; it therefore uses a conventional single-object recognition structure instead of a fully convolutional one.

    (3) O-Net (5 layers)

    O-Net has the most parameters and the deepest structure of the three, because the final stage must identify targets more precisely, which demands more capacity from the network.

    II. Implementation Steps

    1. Download the dataset.
    2. Prepare P-Net training data, train P-Net, save the model.
    3. Prepare R-Net training data, train R-Net, save the model.
    4. Prepare O-Net training data, train O-Net, save the model.
    5. Combine the three networks for inference.

    Model code (for reference only):

    import torch 
    import torch.nn as nn
    
    class PNet(nn.Module):
        def __init__(self):
            super(PNet, self).__init__()
            self.layers = nn.Sequential(
                nn.Conv2d(3,10,3,1,bias=False),nn.BatchNorm2d(10),nn.ReLU(), nn.MaxPool2d(2), # [1, 10, 5, 5]
                nn.Conv2d(10,16,3,1,bias=False),nn.BatchNorm2d(16),nn.ReLU(), # [1, 16, 3, 3]
                nn.Conv2d(16,32,3,1), # [1, 32, 1, 1]
            )
    
            self.cond = nn.Conv2d(32,1,1,1)   # face/no-face confidence head
            self.offset = nn.Conv2d(32,4,1,1) # bounding-box offset head
     
        def forward(self, x):
            y = self.layers(x)
            category = self.cond(y)
            offset = self.offset(y)
            category = torch.sigmoid(category.float())
            return category, offset
    """
    R网络
    """
    class RNet(nn.Module):
        def __init__(self):
            super(RNet, self).__init__()
            self.layer = nn.Sequential(
                nn.Conv2d(3,28,3,1,1,bias=False),nn.BatchNorm2d(28),nn.ReLU(),
                nn.MaxPool2d(3,stride=2),
                nn.Conv2d(28,48,3,1,0,bias=False),nn.BatchNorm2d(48),nn.ReLU(),
                nn.MaxPool2d(3,stride=2),
                nn.Conv2d(48,64,2,1,bias=False),nn.BatchNorm2d(64),nn.ReLU()
            )
    
            self.fc1 = nn.Sequential(
                nn.Linear(64*3*3, 128),nn.ReLU(),
                nn.Linear(128,1)
            )
    
            self.fc2 = nn.Sequential(
                nn.Linear(64*3*3, 128),nn.ReLU(),
                nn.Linear(128,4)
            )
    
        def forward(self, x):
            y = self.layer(x)
            y = y.reshape(-1, 64*3*3)
            class_out = torch.sigmoid(self.fc1(y))
            bbox_out = self.fc2(y)
            return class_out, bbox_out 
    
    """
    O网络
    """
    class ONet(nn.Module):
        def __init__(self):
            super(ONet, self).__init__()
            self.layer = nn.Sequential(
                nn.Conv2d(3,32,3,1,0,bias=False),nn.BatchNorm2d(32),nn.ReLU(),
                nn.MaxPool2d(3,stride=2,padding=1),
                nn.Conv2d(32,64,3,1,0,bias=False),nn.BatchNorm2d(64),nn.ReLU(),
                nn.MaxPool2d(3,stride=2),
                nn.Conv2d(64,64,3,1,0,bias=False),nn.BatchNorm2d(64),nn.ReLU(),
                nn.MaxPool2d(2,stride=2),
                nn.Conv2d(64,128,2,1,bias=False),nn.BatchNorm2d(128),nn.ReLU()
            )
            self.fc1 = nn.Sequential(
                nn.Linear(128*3*3, 256),nn.ReLU(),
                nn.Linear(256,1)
            )
    
            self.fc2 = nn.Sequential(
                nn.Linear(128*3*3, 256),nn.ReLU(),
                nn.Linear(256,4)
            )
    
        def forward(self, x):
            y = self.layer(x)
            y = y.reshape(-1, 128*3*3)
            class_out = torch.sigmoid(self.fc1(y))
            bbox_out = self.fc2(y)
            return class_out, bbox_out
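
    As a quick sanity check (my own addition, not part of the original pipeline), you can instantiate the three networks and confirm the output shapes; in particular, P-Net is fully convolutional and accepts inputs of any size:

    if __name__ == "__main__":
        pnet, rnet, onet = PNet(), RNet(), ONet()
        # A 12x12 input yields a 1x1 confidence map; larger inputs yield larger maps
        cls, off = pnet(torch.randn(1, 3, 12, 12))
        print(cls.shape, off.shape)  # [1, 1, 1, 1] and [1, 4, 1, 1]
        cls, off = pnet(torch.randn(1, 3, 100, 100))
        print(cls.shape, off.shape)  # [1, 1, 45, 45] and [1, 4, 45, 45]
        # R-Net and O-Net expect fixed-size 24x24 and 48x48 crops
        print(rnet(torch.randn(2, 3, 24, 24))[0].shape)  # [2, 1]
        print(onet(torch.randn(2, 3, 48, 48))[0].shape)  # [2, 1]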
    

    III. Dataset Preparation

    The experiments use the WIDER FACE dataset.
    Datasets can be downloaded from the CelebA website or from WIDER FACE: A Face Detection Benchmark.

    When building the training data, the crops are divided by their IoU with the ground-truth box into negative, positive, part, and landmark samples:

    • 0 - 0.3: negative sample
    • 0.3 - 0.4: landmark
    • 0.4 - 0.65: part sample
    • 0.65 - 1.0: positive sample

    Training sample ratio: negative : positive : part : landmark = 3 : 1 : 1 : 2.

    Landmark samples do not take part in training; their role is to widen the margin between positive and negative samples. The exact split depends on what your dataset looks like.
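
    The dataset scripts below import IOU / IOU2 and NMS from a utils module that this article does not show. As a rough reference, a minimal IoU between one crop and a set of ground-truth boxes (my own sketch, assuming [x1, y1, x2, y2] box layout) could look like this:

    import torch

    def IOU(box, boxes):
        """IoU between one box [x1,y1,x2,y2] and an (N,4) tensor of boxes."""
        box_area = (box[2] - box[0]) * (box[3] - box[1])
        areas = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
        # Intersection rectangle (clamped to zero when the boxes do not overlap)
        xx1 = torch.maximum(box[0], boxes[:, 0])
        yy1 = torch.maximum(box[1], boxes[:, 1])
        xx2 = torch.minimum(box[2], boxes[:, 2])
        yy2 = torch.minimum(box[3], boxes[:, 3])
        inter = torch.clamp(xx2 - xx1, min=0) * torch.clamp(yy2 - yy1, min=0)
        return inter / (box_area + areas - inter)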

    Notes when preparing the data:

    • Make sure negatives far outnumber positives and part samples.
    • Do not set the IoU thresholds too low or too high; extreme values hurt generalization, keep the loss from converging, and cap accuracy.
    • Use enough data; with too little, objects that are not faces get misdetected easily.
    • If Asian faces are under-represented, add some manually.

    Below are two scripts that convert face datasets into the structure MTCNN trains on: CelebA & WIDER FACE.

    Dataset preparation code (for reference only):

    """
    celeb A 数据集转换成12,24,48大小的训练数据
    """
    import numpy as np 
    import os 
    from PIL import Image 
    from utils import IOU, NMS
    import torch
    
    TARGET_PATH = "./data/celeba/Datasets/CelebA/Anno/"
    IMG_PATH = "./data/celeba/Datasets/CelebA/Img/img_align_celeba/img_align_celeba"
    DST_PATH = "./data/train/"
    
    label_file_path = TARGET_PATH + "list_bbox_celeba.txt"
    
    # Create the output directory first
    if not os.path.exists(DST_PATH):
        os.mkdir(DST_PATH)
    
    # Generate face samples at each size; negative : positive : part = 3 : 1 : 2
    for face_size in (12,24,48):
        base_path = DST_PATH + f"{face_size}/"
        if not os.path.exists(base_path):
            os.mkdir(base_path)
        if not os.path.exists(base_path+"positive"):
            os.mkdir(base_path+"positive")
        if not os.path.exists(base_path+"negative"):
            os.mkdir(base_path+"negative")
        if not os.path.exists(base_path+"part"):
            os.mkdir(base_path+"part")
        
        # Label files for each sample type
        positive_filename = base_path+"positive.txt"
        negative_filename = base_path+"negative.txt"
        part_filename = base_path+"part.txt"
    
        # Counters
        positive_counter = 0
        negative_counter = 0
        part_counter = 0
        
        try:
            positive_file = open(positive_filename, "w")
            negative_file = open(negative_filename, "w")
            part_file = open(part_filename, "w")
    
            for i, line in enumerate(open(label_file_path)):
                print(f"positive:{positive_counter}, negative:{negative_counter}, part:{part_counter}")
                if i<2: continue 
                try:
                    strs = line.strip().split(" ") # e.g. ['000001.jpg', '', '', '', '95', '', '71', '226', '313']
                    strs = list(filter(bool, strs))# drop empty fields
                    
                    image_filename = strs[0]
                    image_filepath = os.path.join(IMG_PATH, image_filename)
    
                    with Image.open(image_filepath) as img:
                        img_w,img_h = img.size 
                        x1 = float(strs[1].strip())
                        y1 = float(strs[2].strip())
                        w = float(strs[3].strip())
                        h = float(strs[4].strip())
                        x2 = float(x1 + w)
                        y2 = float(y1 + h)
    
                        # Skip faces that are too small or boxes with invalid coordinates
                        if max(w,h)<40 or x1<0 or y1<0 or w<0 or h<0: continue 
    
                        # The annotations are loose; shrink the box slightly toward the face
                        # (compute x2/y2 from the original corner before shifting x1/y1)
                        x2 = int(x1 + w*0.9)
                        y2 = int(y1 + h*0.85)
                        x1 = int(x1 + w*0.12)
                        y1 = int(y1 + h*0.1)
                        w = int(x2-x1)
                        h = int(y2-y1)
    
                        boxes = [[x1, y1, x2, y2]]
                        cx = w/2 + x1
                        cy = h/2 + y1
    
                        # Jitter crops around the box center to create positive and part samples
                        for _ in range(5):
                            w_ = np.random.randint(-w*0.2, w*0.2)
                            h_ = np.random.randint(-h*0.2, h*0.2)
                            cx_ = cx + w_
                            cy_ = cy + h_
    
                            side_len = np.random.randint(round(min(w,h)*0.8), round(max(w,h)*1.25))
                            x1_ = max(cx_ - side_len/2, 0)  # np.max(x, 0) would treat 0 as an axis
                            y1_ = max(cy_ - side_len/2, 0)
                            x2_ = x1_ + side_len
                            y2_ = y1_ + side_len
    
                            crop_box = np.array([x1_, y1_, x2_, y2_])
    
                            # Offsets of the ground-truth box relative to the crop
                            offset_x1 = (x1 - x1_) / side_len # δ = (x1-x1_)/side_len
                            offset_y1 = (y1 - y1_) / side_len
                            offset_x2 = (x2 - x2_) / side_len
                            offset_y2 = (y2 - y2_) / side_len
    
                            face_crop = img.crop(crop_box)
                            face_resize = face_crop.resize((face_size, face_size), Image.LANCZOS)  # Image.ANTIALIAS was removed in Pillow 10
    
                            iou = IOU(torch.tensor(boxes), torch.tensor(crop_box))
    
                            if iou > 0.6: # positive sample
                                positive_file.write(f"positive/{positive_counter}.jpg 1 {offset_x1} {offset_y1} {offset_x2} {offset_y2}\n")
                                positive_file.flush()
                                face_resize.save(base_path+f"positive/{positive_counter}.jpg")
                                positive_counter += 1
                            elif iou > 0.4: # part sample
                                part_file.write(f"part/{part_counter}.jpg 2 {offset_x1} {offset_y1} {offset_x2} {offset_y2}\n")
                                part_file.flush()
                                face_resize.save(base_path+f"part/{part_counter}.jpg")
                                part_counter += 1
                            elif iou < 0.1: # negative sample
                                negative_file.write(f"negative/{negative_counter}.jpg 0 0 0 0 0\n")
                                negative_file.flush()
                                face_resize.save(base_path+f"negative/{negative_counter}.jpg")
                                negative_counter += 1
                        
                        _boxes = torch.tensor(boxes)
    
                        for _ in range(10): # extra random crops so there are enough negatives
                            side_len = np.random.randint(face_size, min(img_w, img_h)/2)
                            x_ = np.random.randint(0,img_w - side_len)
                            y_ = np.random.randint(0,img_h - side_len)
                            crop_box = np.array([x_, y_, x_+side_len, y_+side_len])
    
                            if np.max(IOU(torch.tensor(crop_box), _boxes)) < 0.15:
                                face_crop = img.crop(crop_box)
                                face_resize = face_crop.resize((face_size, face_size), Image.LANCZOS)
                                
                                negative_file.write(f"negative/{negative_counter}.jpg 0 0 0 0 0\n")
                                negative_file.flush()
                                face_resize.save(base_path+f"negative/{negative_counter}.jpg")
                                negative_counter += 1
    
                except Exception:
                    continue  # skip images that fail to parse or load
        except Exception:
            pass
        finally:
            positive_file.close()
            negative_file.close()
            part_file.close()
    
    """
    wider-face 数据集 生成12*12的训练数据
    """
    import os
    import sys
    sys.path.append("./")
    import cv2
    from PIL import Image
    import numpy as np
    from tqdm import tqdm
    from utils.utils import IOU2 
    import torch
    
    TARGET_PATH = "./dataset/"
    IMG_PATH = "./dataset/WIDER_train/images/"
    DST_PATH = "./dataset/"
    face_size = 12
    
    label_file_path = TARGET_PATH + "wider_face_train.txt"
    
    # Create the output directory first
    if not os.path.exists(DST_PATH+f"{face_size}"):
        os.mkdir(DST_PATH+f"{face_size}")
    
    # Where the cropped pos/part/neg images are saved
    pos_save_dir = os.path.join(DST_PATH, f'{face_size}/positive')
    part_save_dir = os.path.join(DST_PATH, f'{face_size}/part')
    neg_save_dir = os.path.join(DST_PATH, f'{face_size}/negative')
    # Root folder of the P-Net data
    save_dir = os.path.join(DST_PATH, f'{face_size}/')
    
    # Create the folders
    if not os.path.exists(save_dir):
        os.mkdir(save_dir)
    if not os.path.exists(pos_save_dir):
        os.mkdir(pos_save_dir)
    if not os.path.exists(part_save_dir):
        os.mkdir(part_save_dir)
    if not os.path.exists(neg_save_dir):
        os.mkdir(neg_save_dir)
    
    # Label list files for the generated data
    positive_file = open(os.path.join(save_dir, 'positive.txt'), 'w')
    negative_file = open(os.path.join(save_dir, 'negative.txt'), 'w')
    part_file = open(os.path.join(save_dir, 'part.txt'), 'w')
    
    # Annotation list of the source dataset
    with open(label_file_path, 'r') as f:
        annotations = f.readlines()
    num = len(annotations)
    print('Total images: %d' % num)
    
    # Counters for the three generated sample types
    positive_counter = 0
    negative_counter = 0
    part_counter = 0
    
    # Number of images read
    idx = 0
    for anno in tqdm(annotations):
        anno = anno.strip().split(' ')
        img_name = anno[0]
        img_path = IMG_PATH+img_name+".jpg"
        # All face box coordinates
        box = list(map(float, anno[1:]))
        # Reshape into (N, 4) boxes of [x1, y1, x2, y2] corners
        boxes = np.array(box, dtype=np.float32).reshape(-1, 4)
        w = box[2] - box[0]
        h = box[3] - box[1]
        
    # With the coordinates parsed, operate on the image
        with Image.open(img_path) as img:
            idx += 1
            img_w, img_h = img.size
    
            # Skip faces that are too small or boxes with invalid coordinates
            if max(w,h)<40 or box[0]<0 or box[1]<0 or w<0 or h<0:  continue 
            # The annotations are loose; shrink the box slightly toward the face
            x1 = int(box[0] + w*0.12)
            y1 = int(box[1] + h*0.1)
            x2 = int(box[0] + w*0.9)
            y2 = int(box[1] + h*0.85)
            w = int(x2-x1)
            h = int(y2-y1)
    
            boxes = [[x1, y1, x2, y2]]
            cx = x1 + w/2
            cy = y1 + h/2
    
            # Visual check of the adjusted box (optional)
            # img = cv2.imread(img_path)
            # img = cv2.rectangle(img, (x1,y1), (x2,y2),(0,0,255),2)
            # cv2.imshow("img",img)
            # cv2.waitKey(0)
    
            # Jitter crops around the box center to create positive and part samples
            for _ in range(5):
                w_ = np.random.randint(-w*0.2, w*0.2)
                h_ = np.random.randint(-h*0.2, h*0.2)
                cx_ = cx + w_
                cy_ = cy + h_
    
                side_len = np.random.randint(round(min(w,h)*0.8), round(max(w,h)*1.25))
                x1_ = max(cx_ - side_len/2, 0)  # np.max(x, 0) would treat 0 as an axis
                y1_ = max(cy_ - side_len/2, 0)
                x2_ = x1_ + side_len
                y2_ = y1_ + side_len
    
                crop_box = np.array([x1_, y1_, x2_, y2_])
    
                # Offsets of the ground-truth box relative to the crop
                offset_x1 = (x1 - x1_) / side_len # δ = (x1-x1_)/side_len
                offset_y1 = (y1 - y1_) / side_len
                offset_x2 = (x2 - x2_) / side_len
                offset_y2 = (y2 - y2_) / side_len
    
                face_crop = img.crop(crop_box)
                face_resize = face_crop.resize((face_size, face_size), Image.LANCZOS)  # Image.ANTIALIAS was removed in Pillow 10
    
                iou = IOU2(torch.tensor(boxes), torch.tensor(crop_box)).item()
    
                if iou > 0.6: # positive sample
                    positive_file.write(f"positive/{positive_counter}.jpg 1 {offset_x1} {offset_y1} {offset_x2} {offset_y2}\n")
                    positive_file.flush()
                    face_resize.save(pos_save_dir+f"/{positive_counter}.jpg")
                    positive_counter += 1
                elif iou > 0.4: # part sample
                    part_file.write(f"part/{part_counter}.jpg 2 {offset_x1} {offset_y1} {offset_x2} {offset_y2}\n")
                    part_file.flush()
                    face_resize.save(part_save_dir+f"/{part_counter}.jpg")
                    part_counter += 1
                elif iou < 0.1: # negative sample
                    negative_file.write(f"negative/{negative_counter}.jpg 0 0 0 0 0\n")
                    negative_file.flush()
                    face_resize.save(neg_save_dir+f"/{negative_counter}.jpg")
                    negative_counter += 1
            
            _boxes = torch.tensor(boxes[0])
    
            for _ in range(10): # extra random crops so there are enough negatives
                side_len = np.random.randint(face_size, min(img_w, img_h)/2)
                x_ = np.random.randint(0,img_w - side_len)
                y_ = np.random.randint(0,img_h - side_len)
                crop_box = np.array([x_, y_, x_+side_len, y_+side_len])
    
                if IOU2(torch.tensor(crop_box), _boxes).item() < 0.15:
                    face_crop = img.crop(crop_box)
                    face_resize = face_crop.resize((face_size, face_size), Image.LANCZOS)
                    
                    negative_file.write(f"negative/{negative_counter}.jpg 0 0 0 0 0\n")
                    negative_file.flush()
                    face_resize.save(neg_save_dir+f"/{negative_counter}.jpg")
                    negative_counter += 1
    
    positive_file.close()
    negative_file.close()
    part_file.close()
    
    

    Dataset loading code (a torch Dataset over the generated samples):

    import torch 
    from torch.utils.data import DataLoader, Dataset 
    from torchvision.transforms import ToTensor
    from PIL import Image
    
    class MTCNN_Dataset(Dataset):
        def __init__(self, path) -> None:
            super().__init__()
            self.path = path 
            self.dataset = []
            self.trans = ToTensor()
            with open(self.path + "/positive.txt") as f:
                self.dataset.extend(f.readlines())
            with open(self.path + "/negative.txt") as f:
                self.dataset.extend(f.readlines())
            with open(self.path + "/part.txt") as f:
                self.dataset.extend(f.readlines())
    
        def __len__(self):
            return len(self.dataset)
        
        def __getitem__(self, index):
            strs = self.dataset[index].strip().split(" ")
            img_path = self.path + f"/{strs[0]}"
            cls = float(strs[1])
            offset = list(map(float, strs[2:]))
            img = Image.open(img_path)
            img = self.trans(img)
            return img, torch.tensor([cls]), torch.tensor(offset) 
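
    A minimal usage sketch (the path is hypothetical and assumes the 12x12 data generated by the CelebA script above):

    if __name__ == "__main__":
        dataset = MTCNN_Dataset("./data/train/12")
        loader = DataLoader(dataset, batch_size=512, shuffle=True)
        img, cls, offset = next(iter(loader))
        print(img.shape, cls.shape, offset.shape)  # [512, 3, 12, 12] [512, 1] [512, 4]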
    

    IV. Model Training

    An important point during training is that we compute two losses: a confidence loss and an offset loss. For confidence we use a sigmoid activation plus BCELoss; for the offsets we use MSELoss.

    When computing the confidence loss, we only use the positives and negatives in a batch (part samples are excluded).
    When computing the offset loss, we only use the positives and part samples, not the negatives.

    This can be done with the confidence labels for positives (1), negatives (0), and part samples (2). For example, label < 2 selects the positives and negatives; label > 0 selects the positives and part samples.
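
    A minimal sketch of that masking logic with the 0/1/2 labels just described (my own illustration; the reference losses further below use a different label convention):

    import torch
    import torch.nn as nn

    def mtcnn_loss(cls_out, offset_out, cls_label, offset_label):
        # Confidence loss over positives (1) and negatives (0) only, i.e. label < 2
        cls_mask = (cls_label < 2).squeeze(-1)
        cls_loss = nn.functional.binary_cross_entropy(cls_out[cls_mask], cls_label[cls_mask])
        # Offset loss over positives (1) and part samples (2) only, i.e. label > 0
        box_mask = (cls_label > 0).squeeze(-1)
        box_loss = nn.functional.mse_loss(offset_out[box_mask], offset_label[box_mask])
        return cls_loss + box_loss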

    I cover the three network structures and their training code in separate notes; the code below is for reference only.
    For the P-Net structure and training code, see:
    MTCNN网络之P网络——pytorch代码实现 - 简书 (jianshu.com)

    For the R-Net structure and training code, see:
    MTCNN之R网络——pytorch代码实现 - 简书 (jianshu.com)

    For the O-Net structure and training code, see:
    MTCNN网络之O网络——pytorch代码实现 - 简书 (jianshu.com)

    Loss function code (note: this reference implementation uses a two-class CrossEntropy confidence head and the label convention neg=0, pos=1, part=-1, landmark=-2, which differs from the 0/1/2 labels above):

    class ClassLoss(nn.Module):
        def __init__(self):
            super(ClassLoss, self).__init__()
            self.entropy_loss = nn.CrossEntropyLoss(ignore_index=-100, reduction='none')
            self.keep_ratio = 0.7
    
        def forward(self, class_out, label):
            # Keep neg (0) and pos (1); part (-1) and landmark (-2) map to ignore_index
            label = torch.where(label < 0, -100, label)
            # Mask marking the valid (neg/pos) entries
            valid_label = torch.where(label >= 0, 1, 0)
            num_valid = torch.sum(valid_label)
            keep_num = int((num_valid * self.keep_ratio).cpu().numpy())
            label = torch.squeeze(label)
            # Per-sample cross-entropy loss
            loss = self.entropy_loss(input=class_out, target=label)
            # Online hard example mining: average over the top 70% of valid losses
            loss, _ = torch.topk(torch.squeeze(loss), k=keep_num)
            return torch.mean(loss)
    
    
    class BBoxLoss(nn.Module):
        def __init__(self):
            super(BBoxLoss, self).__init__()
            self.square_loss = nn.MSELoss(reduction='none')
            self.keep_ratio = 1.0
    
        def forward(self, bbox_out, bbox_target, label):
            # Keep pos (1) and part (-1) samples
            valid_label = torch.where(torch.abs(label) == 1, 1, 0)
            valid_label = torch.squeeze(valid_label)
            # Number of valid samples
            keep_num = int(torch.sum(valid_label).cpu().numpy() * self.keep_ratio)
            loss = self.square_loss(input=bbox_out, target=bbox_target)
            loss = torch.sum(loss, dim=1)
            loss = loss * valid_label  # same device as the labels; hard-coding .cuda() would break CPU runs
            # Average the loss over the valid samples
            loss, _ = torch.topk(loss, k=keep_num, dim=0)
            return torch.mean(loss)
    

    Training code (P-Net as the example):

    # Excerpt: assumes model, train_loader (yielding images, labels, boxes and
    # landmarks), device, epoch_num and model_path are defined elsewhere.
    import os
    from datetime import datetime
    from torch.optim.lr_scheduler import MultiStepLR
    
    # Optimizer
    optimizer = torch.optim.Adam(params=model.parameters(), lr=0.001, weight_decay=1e-4)
    
    # Learning-rate decay schedule
    scheduler = MultiStepLR(optimizer, milestones=[6, 14, 20], gamma=0.1)
    
    # Loss functions
    class_loss = ClassLoss()
    bbox_loss = BBoxLoss()
    landmark_loss = LandmarkLoss()
    
    # Training loop
    for epoch in range(epoch_num):
        for batch_id, (img, label, bbox, landmark) in enumerate(train_loader):
            img = img.to(device)
            label = label.to(device).long()
            bbox = bbox.to(device)
            landmark = landmark.to(device)
            class_out, bbox_out, landmark_out = model(img)
            cls_loss = class_loss(class_out, label)
            box_loss = bbox_loss(bbox_out, bbox, label)
            landmarks_loss = landmark_loss(landmark_out, landmark, label)
            total_loss = radio_cls_loss * cls_loss + radio_bbox_loss * box_loss + radio_landmark_loss * landmarks_loss
            optimizer.zero_grad()
            total_loss.backward()
            optimizer.step()
            if batch_id % 100 == 0:
                acc = accuracy(class_out, label)
                print('[%s] Train epoch %d, batch %d, total_loss: %f, cls_loss: %f, box_loss: %f, landmarks_loss: %f, '
                      'accuracy:%f' % (datetime.now(), epoch, batch_id, total_loss, cls_loss, box_loss, landmarks_loss, acc))
        scheduler.step()
    
        # Save the model (as TorchScript)
        if not os.path.exists(model_path):
            os.makedirs(model_path)
        torch.jit.save(torch.jit.script(model), os.path.join(model_path, 'PNet.pth'))
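
    The excerpt leaves radio_cls_loss, radio_bbox_loss, radio_landmark_loss and accuracy undefined. A plausible sketch (the loss weights follow the MTCNN paper for P-Net/R-Net; the accuracy helper is my own assumption, matching the -1/-2 label convention of the losses above):

    # MTCNN paper weights: P/R-Net use cls : box : landmark = 1.0 : 0.5 : 0.5,
    # O-Net uses 1.0 : 0.5 : 1.0.
    radio_cls_loss, radio_bbox_loss, radio_landmark_loss = 1.0, 0.5, 0.5

    def accuracy(class_out, label):
        # Only pos (1) and neg (0) labels count; part (-1) and landmark (-2) are skipped
        label = torch.squeeze(label)
        mask = label >= 0
        pred = torch.argmax(class_out, dim=1)
        return (pred[mask] == label[mask]).float().mean().item()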
    

    V. Model Testing

    (1) Still-image detection:
    import argparse 
    import os 
    import cv2 
    from PIL import Image, ImageDraw, ImageFont
    from torchvision import transforms
    import numpy as np 
    import torch 
    from models.PNet import PNet
    from models.ONet import ONet
    from models.RNet import RNet
    from utils.utils import NMS, convert2square
    
    font = cv2.FONT_HERSHEY_DUPLEX  # font for drawing confidence scores
    
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_path', type=str, default='./param/', help='folder containing the PNet/RNet/ONet weight files')
    parser.add_argument('--p_cls', type=float, default=0.7, help='P-Net confidence threshold')
    parser.add_argument('--p_nms', type=float, default=0.4, help='P-Net NMS threshold')
    parser.add_argument('--r_cls', type=float, default=0.8, help='R-Net confidence threshold')
    parser.add_argument('--r_nms', type=float, default=0.4, help='R-Net NMS threshold')
    parser.add_argument('--o_cls', type=float, default=0.8, help='O-Net confidence threshold')
    parser.add_argument('--o_nms', type=float, default=0.1, help='O-Net NMS threshold')
    args = parser.parse_args()
    
    device = "cuda" if torch.cuda.is_available() else "cpu"
    
    class Detecter:
        def __init__(self) -> None:
            # Load the P-Net model
            self.pnet = PNet()
            self.pnet.load_state_dict(torch.load(args.model_path + "pnet.pt"))
            self.pnet.to(device)
            self.pnet.eval()
    
            # Load the R-Net model
            self.rnet = RNet()
            self.rnet.load_state_dict(torch.load(args.model_path + "rnet.pt"))
            self.rnet.to(device)
            self.rnet.eval()
    
            # Load the O-Net model
            self.onet = ONet()
            self.onet.load_state_dict(torch.load(args.model_path + "onet.pt"))
            self.onet.to(device)
            self.onet.eval()
    
            self._image_transform = transforms.ToTensor()
    
        def detect_img(self, image):
            # Stage 1: P-Net
            pnet_boxes = self.__pnet_detect(image)
            if pnet_boxes.shape[0] == 0:
                return np.array([])
            # return pnet_boxes
            
            # Stage 2: R-Net
            rnet_boxes = self.__rnet_detect(image, pnet_boxes)
            if rnet_boxes.shape[0] == 0:
                return np.array([])
            # return rnet_boxes
            
            # Stage 3: O-Net
            onet_boxes = self.__onet_detect(image, rnet_boxes)
            if onet_boxes.shape[0] == 0:
                return np.array([])
            return onet_boxes
        
        def __pnet_detect(self, image):
            boxes = []
            img = image 
            w,h = img.size 
            min_side_len = min(w, h)
    
            scale = 1
            while min_side_len > 12:
                img_data = self._image_transform(img).to(device)
                img_data.unsqueeze_(0) # add a batch dimension
    
                cls, offset = self.pnet(img_data)
                cls = cls[0][0].cpu().data # [995, 1495]
                offset = offset[0].cpu().data # [4, 995, 1495]
    
                idxs = torch.nonzero(torch.gt(cls, args.p_cls)) # indices where confidence exceeds args.p_cls, e.g. [7277, 2]
    
                for idx in idxs:
                    boxes.append(self.__box(idx, offset, cls[idx[0], idx[1]], scale)) # __box maps offsets back to original-image coordinates
    
                scale *= 0.8 
                _w = int(w*scale)
                _h = int(h*scale)
    
                img = img.resize((_w, _h))
                min_side_len = min(_h, _w)
            boxes = NMS(torch.tensor(boxes),thre=args.p_nms)
            return boxes
        
        def __rnet_detect(self, image, p_box):
            _img_dataset = []
            square_box = convert2square(np.array(p_box[:,1:]))
            for _box in square_box:
                # Crop the candidate region from the original image
                _x1 = int(_box[0]) 
                _y1 = int(_box[1]) 
                _x2 = int(_box[2]) 
                _y2 = int(_box[3])
    
                img = image.crop((_x1, _y1, _x2, _y2)) 
                img = img.resize((24,24))
                img_data = self._image_transform(img)
                _img_dataset.append(img_data)
            img_dataset = torch.stack(_img_dataset).to(device)
    
            # Run R-Net on the batch of crops
            _cls, _offset = self.rnet(img_dataset)
            cls = _cls.cpu().data
            offset = _offset.cpu().data
    
            boxes = []
            idxs, _ = np.where(cls > args.r_cls)
            for idx in idxs:
                _box = square_box[idx]
    
                _x1 = int(_box[0]) 
                _y1 = int(_box[1]) 
                _x2 = int(_box[2]) 
                _y2 = int(_box[3])
    
                ow = _x2 - _x1
                oh = _y2 - _y1
    
                x1 = _x1 + ow * offset[idx][0]
                y1 = _y1 + oh * offset[idx][1]
                x2 = _x2 + ow * offset[idx][2]
                y2 = _y2 + oh * offset[idx][3]
    
                boxes.append([cls[idx][0], x1,y1,x2,y2])
            boxes = NMS(torch.tensor(boxes),thre=args.r_nms,isMin=True)
            return boxes
        
        def __onet_detect(self, image, r_box):
            _img_dataset = []
            square_box = convert2square(np.array(r_box[:,1:]))
            for _box in square_box:
                # Crop the candidate region from the original image
                _x1 = int(_box[0]) 
                _y1 = int(_box[1]) 
                _x2 = int(_box[2]) 
                _y2 = int(_box[3])
    
                img = image.crop((_x1, _y1, _x2, _y2)) 
                img = img.resize((48,48))
                img_data = self._image_transform(img)
                _img_dataset.append(img_data)
            img_dataset = torch.stack(_img_dataset).to(device)
    
            # Run O-Net on the batch of crops
            _cls, _offset = self.onet(img_dataset)
            cls = _cls.cpu().data
            offset = _offset.cpu().data
    
            boxes = []
            idxs, _ = np.where(cls > args.o_cls)
            for idx in idxs:
                _box = square_box[idx]
    
                _x1 = int(_box[0]) 
                _y1 = int(_box[1]) 
                _x2 = int(_box[2]) 
                _y2 = int(_box[3])
    
                ow = _x2 - _x1
                oh = _y2 - _y1
    
                x1 = _x1 + ow * offset[idx][0]
                y1 = _y1 + oh * offset[idx][1]
                x2 = _x2 + ow * offset[idx][2]
                y2 = _y2 + oh * offset[idx][3]
    
                boxes.append([cls[idx][0], x1,y1,x2,y2])
            boxes = NMS(torch.tensor(boxes),thre=args.o_nms, isMin=True)
            return boxes
    
        def __box(self, idx, offset, cls, scale, stride=2, side_len=12): # P-Net's effective stride is 2 (from the max-pool)
            # Map the feature-map cell back to the corresponding 12x12 region in the original image
            _x1 = (idx[1].float() * stride) / scale
            _y1 = (idx[0].float() * stride) / scale
            _x2 = (idx[1].float() * stride + side_len - 1) / scale
            _y2 = (idx[0].float() * stride + side_len - 1) / scale
    
            ow = _x2 - _x1
            oh = _y2 - _y1
            
            _offset = offset[:, idx[0], idx[1]] # offsets at this cell, shape [4]
    
            x1 = _x1 + ow * _offset[0]
            y1 = _y1 + oh * _offset[1]
            x2 = _x2 + ow * _offset[2]
            y2 = _y2 + oh * _offset[3]
    
            return [cls, x1,y1,x2,y2]
    
    if __name__ == "__main__":
        img_path = "./dataset/0.jpg"
        img = Image.open(img_path)
        de = Detecter()
        boxes = de.detect_img(img)
        # print(boxes.shape)
    
        img2 = cv2.imread(img_path)
        for box in boxes:
            img2 = cv2.rectangle(img2, (int(box[1]), int(box[2])), (int(box[3]), int(box[4])),(0,0,255),2)
            cls = round(box[0].item(),2)
            img2 = cv2.putText(img2, f"{cls}", (int(box[1]), int(box[2])), font, 1, (255, 0, 0), 2)
            # break
        img2 = cv2.resize(img2, (1000, 1000))
        cv2.imshow("img", img2)
        cv2.waitKey(0)
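
    The detector imports NMS and convert2square from utils.utils, which this article does not show. Here is a rough sketch of what they might look like (my own reconstruction; box rows are assumed to be [cls, x1, y1, x2, y2] for NMS and [x1, y1, x2, y2] for convert2square):

    import numpy as np
    import torch

    def convert2square(boxes):
        """Expand each [x1,y1,x2,y2] box to a square around its center."""
        square = boxes.copy()
        w = boxes[:, 2] - boxes[:, 0]
        h = boxes[:, 3] - boxes[:, 1]
        side = np.maximum(w, h)
        square[:, 0] = boxes[:, 0] + w / 2 - side / 2
        square[:, 1] = boxes[:, 1] + h / 2 - side / 2
        square[:, 2] = square[:, 0] + side
        square[:, 3] = square[:, 1] + side
        return square

    def NMS(boxes, thre=0.3, isMin=False):
        """Greedy non-maximum suppression over [cls, x1, y1, x2, y2] rows."""
        if boxes.shape[0] == 0:
            return torch.tensor([])
        boxes = boxes[boxes[:, 0].argsort(descending=True)]  # highest confidence first
        keep = []
        while boxes.shape[0] > 1:
            best, rest = boxes[0], boxes[1:]
            keep.append(best)
            # Overlap of the best box with all remaining boxes
            x1 = torch.maximum(best[1], rest[:, 1])
            y1 = torch.maximum(best[2], rest[:, 2])
            x2 = torch.minimum(best[3], rest[:, 3])
            y2 = torch.minimum(best[4], rest[:, 4])
            inter = torch.clamp(x2 - x1, min=0) * torch.clamp(y2 - y1, min=0)
            area_best = (best[3] - best[1]) * (best[4] - best[2])
            area_rest = (rest[:, 3] - rest[:, 1]) * (rest[:, 4] - rest[:, 2])
            if isMin:  # overlap relative to the smaller box (used by R/O stages above)
                ovr = inter / torch.minimum(area_best, area_rest)
            else:      # standard IoU
                ovr = inter / (area_best + area_rest - inter)
            boxes = rest[ovr < thre]
        if boxes.shape[0] > 0:
            keep.append(boxes[0])
        return torch.stack(keep)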
    
    

    (2) Real-time detection from the camera:

    from face_detect import Detecter 
    import cv2 
    from PIL import Image 
    
    font = cv2.FONT_HERSHEY_DUPLEX  # font for drawing confidence scores
    
    if __name__ == "__main__":
        de = Detecter()
        capture = cv2.VideoCapture(0)
        while True:
            ret, frame = capture.read()
            if not ret:
                break
            frame = cv2.flip(frame, 1)
            img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
            boxes = de.detect_img(img)
            for box in boxes:
                frame = cv2.rectangle(frame, (int(box[1]), int(box[2])), (int(box[3]), int(box[4])),(0,0,255),2)
                cls = round(box[0].item(),2)
                frame = cv2.putText(frame, f"{cls}", (int(box[1]), int(box[2])), font, 1, (254, 0, 0), 2)
            cv2.imshow("Face_Detect", frame)
            if cv2.waitKey(50) & 0xFF == ord('q'):  # press q to quit; otherwise release() below is unreachable
                break
        capture.release()
        cv2.destroyAllWindows()
    
    
    

    Results:


    [Figures: still-image detection and real-time camera detection; the pixelation was added afterwards]

    This article focuses on understanding the core steps of MTCNN, so the code is for reference only. For a complete implementation, see Pytorch-MTCNN: 基于Pytorch实现的MTCNN模型,人脸检测,人脸关键点检测。 (gitee.com)
