示例数据集:kaggle上面的癌症数据集
Histopathologic Cancer Detection
引入包及相关参数
%matplotlib inline
import matplotlib.pyplot as plt
from PIL import Image, ImageDraw
import numpy as np
import pandas as pd
import os
import copy
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split
import torchvision.transforms as transforms
from torchvision import utils
import torch.nn.functional as F
from torch import optim
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torchsummary import summary
# Pick the compute device: GPU when CUDA is usable, otherwise CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Worker-process count meant for DataLoader(num_workers=...):
# none on Windows (os.name == 'nt'), four everywhere else.
if os.name == 'nt':
    workers = 0
else:
    workers = 4
数据检查(探索)
# --- Data exploration ---
# Load the label table.
path_csv = './data/bic/train_labels.csv'
labels_df = pd.read_csv(path_csv)

# Peek at the first rows to see the layout.
print(labels_df.head())

# Number of samples per class.
print(labels_df['label'].value_counts())

# Histogram of the label column.
labels_df['label'].hist()

# Ids of the rows whose label is 1; a few of them are previewed below.
is_positive = labels_df['label'] == 1
malignant_ids = labels_df.loc[is_positive]['id'].values

# Directory holding the training images.
path_train = './data/bic/train/'

# True: show the image in colour; anything else: show channel 0 in grayscale.
color_flag = True

# Figure layout for the preview grid.
plt.rcParams['figure.figsize'] = (10, 10)
plt.subplots_adjust(wspace=0, hspace=0)
nrows, ncols = 3, 3

for i, idx in enumerate(malignant_ids[:nrows * ncols]):
    tile_path = os.path.join(path_train, idx + '.tif')
    img = Image.open(tile_path)
    # Outline the box from (32, 32) to (64, 64) in green.
    ImageDraw.Draw(img).rectangle(((32, 32), (64, 64)), outline='green')
    plt.subplot(nrows, ncols, i + 1)
    pixels = np.array(img)
    if color_flag is True:
        plt.imshow(pixels)
    else:
        plt.imshow(pixels[:, :, 0], cmap='gray')
    plt.axis('off')

# Shape and value range of the last image shown.
print(f'图片尺寸规模:{np.array(img).shape} ')
print(f'图片的像素值范围:{np.min(img)}~{np.max(img)}')
"""
id label
0 f38a6374c348f90b587e046aac6079959adf3835 0
1 c18f2d887b7ae4f6742ee445113fa1aef383ed77 1
2 755db6279dae599ebb4d39a9123cce439965282d 0
3 bc3f0c64fb968ff4a8bd33af6971ecae77c75e08 0
4 068aba587a4950175d04c680d38943fd488d6a9d 0
0 130908
1 89117
Name: label, dtype: int64
图片尺寸规模:(96, 96, 3)
图片的像素值范围:0~255
"""
data explore
构建数据集(dataset)
# Build the dataset.
# Seed PyTorch's global RNG so later random operations are repeatable.
torch.manual_seed(0)


class histoCancerDataset(Dataset):
    """Image/label dataset backed by an image folder and a label CSV.

    Images are read from ``<data_dir>/<data_type>/`` and labels from
    ``<data_dir>/<data_type>_labels.csv``. The CSV must have an ``id``
    column holding each image's file name without extension; the first
    remaining column is used as the label.

    Args:
        data_dir: Root directory containing the image folder and the CSV.
        transform: Callable applied to each opened PIL image.
        data_type: Name of the image sub-folder and CSV prefix.

    Raises:
        KeyError: If an image file has no matching ``id`` in the CSV.
    """

    # Only files with these extensions are treated as images, so stray
    # files in the folder (e.g. OS metadata) do not break the label lookup.
    IMAGE_EXTENSIONS = ('.tif', '.tiff')

    def __init__(self, data_dir, transform, data_type='train'):
        path_data = os.path.join(data_dir, data_type)
        # os.listdir order is arbitrary; sort so index -> image is stable
        # across runs and machines.
        filenames = sorted(
            f for f in os.listdir(path_data)
            if f.lower().endswith(self.IMAGE_EXTENSIONS)
        )
        self.full_filenames = [os.path.join(path_data, f) for f in filenames]

        csv_filename = data_type + '_labels.csv'
        path_label_csv = os.path.join(data_dir, csv_filename)
        labels_df = pd.read_csv(path_label_csv)
        # Index by image id so labels can be looked up by file stem.
        labels_df.set_index('id', inplace=True)
        # splitext strips the extension whatever its length.
        self.labels = [
            labels_df.loc[os.path.splitext(f)[0]].values[0] for f in filenames
        ]
        self.transform = transform

    def __len__(self):
        """Return the number of images."""
        return len(self.full_filenames)

    def __getitem__(self, idx):
        """Return ``(transformed_image, label)`` for position ``idx``."""
        # Apply the transform while the file is open, then release the handle.
        with Image.open(self.full_filenames[idx]) as handle:
            image = self.transform(handle)
        return image, self.labels[idx]
# Transform applied to every image: convert to a tensor.
to_tensor_only = [transforms.ToTensor()]
data_transformer = transforms.Compose(to_tensor_only)
data_dir = './data/bic/'
histo_dataset = histoCancerDataset(data_dir, data_transformer, 'train')

# Load one sample to check that images are read correctly.
img, label = histo_dataset[9]
print(img.shape, torch.min(img), torch.max(img))
print(len(histo_dataset))

# Split the dataset: 80% for training, the remainder for validation.
len_histo = len(histo_dataset)
len_train = int(0.8 * len_histo)
len_val = len_histo - len_train
split_sizes = [len_train, len_val]
train_ds, val_ds = random_split(histo_dataset, split_sizes)

# Report the size of each subset.
print(f'训练数据集总数: {len(train_ds)}')
print(f'验证数据集总数: {len(val_ds)}')
"""
torch.Size([3, 96, 96]) tensor(0.) tensor(1.)
220025
训练数据集总数: 176020
验证数据集总数: 44005
"""
随机检验下各数据集里的数据情况
# Helper for eyeballing images from a dataset.
def show(img, label, color=False):
    """Display an image tensor with matplotlib, titled with its label.

    Args:
        img: Tensor whose axes are reordered with (1, 2, 0) before display,
            i.e. it is treated as channel-first (C*H*W).
        label: Value rendered in the title via ``str(label)``.
        color: When equal to False only channel 0 is drawn, in grayscale;
            otherwise the whole array is drawn.
    """
    # C*H*W tensor -> H*W*C NumPy array, the layout imshow expects.
    hwc = np.transpose(img.numpy(), (1, 2, 0))
    if color != False:  # noqa: E712 - equality semantics kept on purpose
        plt.imshow(hwc, interpolation='nearest')
    else:
        first_channel = hwc[:, :, 0]
        plt.imshow(first_channel, interpolation='nearest', cmap='gray')
    plt.title('label: ' + str(label))
# Draw a few random samples from the training subset (the same approach
# works for the validation subset).
grid_size = 4
random_idx = np.random.randint(0, len(train_ds), grid_size)
print(f'图片id: {random_idx}')

# Fetch each chosen sample and separate images from labels.
picked = [train_ds[i] for i in random_idx]
x_grid_train = [sample[0] for sample in picked]
y_grid_train = [sample[1] for sample in picked]

# Tile the images into a single grid image.
x_grid_train = utils.make_grid(x_grid_train, nrow=4, padding=2)
print(x_grid_train.shape)

plt.rcParams['figure.figsize'] = (10.0, 5)
show(x_grid_train, y_grid_train, color=True)
"""
图片id: [111514 50544 118663 62662]
torch.Size([3, 100, 394])
"""
make_grid show train_ds
定义各数据集转换 transforms
# Per-split transforms.
# Training: random flips, rotation and crop-and-resize, then to tensor.
train_transformer = transforms.Compose([
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.RandomVerticalFlip(p=0.5),
    transforms.RandomRotation(45),
    transforms.RandomResizedCrop(96, scale=(0.8, 1.0), ratio=(1.0, 1.0)),
    transforms.ToTensor()])
# Validation: no augmentation, only conversion to tensor.
val_transformer = transforms.Compose([transforms.ToTensor()])

# random_split returns Subset objects whose __getitem__ delegates to the
# wrapped dataset, so a `.transform` attribute set on the Subset itself is
# never read. Give each subset its own shallow copy of the base dataset
# (file list and labels stay shared) and set the transform there.
train_ds.dataset = copy.copy(train_ds.dataset)
train_ds.dataset.transform = train_transformer
val_ds.dataset = copy.copy(val_ds.dataset)
val_ds.dataset.transform = val_transformer
# Kept so code that reads these attributes from the subsets still works.
train_ds.transform = train_transformer
val_ds.transform = val_transformer

# Data loaders.
train_dl = DataLoader(train_ds, batch_size=32, shuffle=True)
val_dl = DataLoader(val_ds, batch_size=64, shuffle=False)
"""
# 查看一下数据加载工作流
for x, y in train_dl:
print(x.shape)
print(y.shape)
break
# torch.Size([32, 3, 96, 96])
# torch.Size([32])
for x, y in val_dl:
print(x.shape)
print(y.shape)
break
# torch.Size([64, 3, 96, 96])
# torch.Size([64])
"""
定义一些辅助函数
Dumb Baselines
# Dumb baselines.
# Ground-truth labels of the validation subset. They are read from the
# wrapped dataset's label list through the subset's indices, which avoids
# opening and transforming every validation image just to get its label.
y_val = [val_ds.dataset.labels[i] for i in val_ds.indices]
# Evaluation metric: accuracy.
def accuracy(labels, out):
    """Return the fraction of entries in ``out`` that equal ``labels``.

    Both arguments are converted to NumPy arrays first, so plain Python
    lists are compared element-wise rather than as whole objects.

    Args:
        labels: Sequence of true labels.
        out: Sequence of predicted labels, same length as ``labels``.

    Returns:
        Number of matching positions divided by ``len(labels)``.
    """
    labels = np.asarray(labels)
    out = np.asarray(out)
    return np.sum(out == labels) / float(len(labels))
# Accuracy of three trivial predictors on the validation labels.
# Always predict 0.
pred_all_zeros = np.zeros_like(y_val)
acc_all_zeros = accuracy(y_val, pred_all_zeros)
# Always predict 1.
pred_all_ones = np.ones_like(y_val)
acc_all_ones = accuracy(y_val, pred_all_ones)
# Predict 0 or 1 at random.
pred_random = np.random.randint(2, size=len(y_val))
acc_random = accuracy(y_val, pred_random)

print("accuracy random prediction: %.2f" % acc_random)
print("accuracy all zero prediction: %.2f" % acc_all_zeros)
print("accuracy all one prediction: %.2f" % acc_all_ones)
"""
accuracy random prediction: 0.50
accuracy all zero prediction: 0.59
accuracy all one prediction: 0.41
"""
get_conv2d_out_shape,卷积层后输出大小
# Spatial output size of a conv layer, optionally followed by pooling.
import torch.nn as nn


def get_conv2d_out_shape(H_in, W_in, conv, pool=2):
    """Return ``(H_out, W_out)`` after ``conv`` and an optional pooling step.

    Args:
        H_in: Input height.
        W_in: Input width.
        conv: Layer exposing ``kernel_size``, ``stride``, ``padding`` and
            ``dilation`` as 2-element sequences (e.g. ``nn.Conv2d``).
        pool: Divisor applied after the convolution; a falsy value skips it.

    Returns:
        Tuple of ints, each truncated with ``int()``.
    """
    # Ref: https://pytorch.org/docs/stable/nn.html
    def _along(size_in, axis):
        # Extent covered by the (dilated) kernel beyond its first element.
        reach = conv.dilation[axis] * (conv.kernel_size[axis] - 1)
        padded = size_in + 2 * conv.padding[axis]
        size_out = np.floor((padded - reach - 1) / conv.stride[axis] + 1)
        if pool:
            size_out /= pool
        return int(size_out)

    return _along(H_in, 0), _along(W_in, 1)


# Example: 3x3 convolution on a 96x96 input followed by 2x pooling.
conv1 = nn.Conv2d(3, 8, kernel_size=3)
h, w = get_conv2d_out_shape(96, 96, conv1)
print(h, w)
# 47, 47
get_lr,取得模型中的学习率当前值
# Read the current learning rate from an optimizer.
def get_lr(opt):
    """Return the ``'lr'`` of the optimizer's first parameter group.

    Returns None when ``opt.param_groups`` is empty.
    """
    first_group = next(iter(opt.param_groups), None)
    if first_group is None:
        return None
    return first_group['lr']
metrics_batch,各批次运算过程中的正确数
# Count correct predictions in one batch.
def metrics_batch(output, target):
    """Return how many rows of ``output`` have their argmax equal to ``target``.

    Args:
        output: Tensor of per-class scores; the class is taken along dim 1.
        target: Tensor of true class indices, reshaped to match the
            predictions before comparison.

    Returns:
        Python number of matching predictions.
    """
    # Predicted class per row, kept as a column so target can be viewed alike.
    predicted = torch.argmax(output, dim=1, keepdim=True)
    hits = torch.eq(predicted, target.view_as(predicted))
    return hits.sum().item()
loss_batch,每批次运算结果
# Loss (and optional optimisation step) for a single batch.
def loss_batch(loss_func, output, target, opt=None):
    """Compute the batch loss and correct count; step ``opt`` if one is given.

    Args:
        loss_func: Callable ``loss_func(output, target)`` returning a tensor.
        output: Model output for the batch.
        target: True labels for the batch.
        opt: Optimizer; when None no backward pass or update is performed.

    Returns:
        Tuple ``(loss_value, correct_count)``, where ``loss_value`` is
        ``loss.item()`` and ``correct_count`` comes from ``metrics_batch``.
    """
    batch_loss = loss_func(output, target)
    # Number of correct predictions in this batch.
    correct = metrics_batch(output, target)

    if opt is None:
        return batch_loss.item(), correct

    # Training mode: clear old gradients, backpropagate, update weights.
    opt.zero_grad()
    batch_loss.backward()
    opt.step()
    return batch_loss.item(), correct
loss_epoch,每轮次运算结果
# Loss and accuracy over one pass of a data loader.
def loss_epoch(model, loss_func, dataset_dl, sanity_check=False, opt=None):
    """Run ``model`` over ``dataset_dl`` and return mean loss and accuracy.

    The running sums are divided by the number of samples actually
    processed. With ``sanity_check=True`` only the first batch is run, so
    dividing by the full dataset size would give misleading values.

    Args:
        model: Callable applied to each input batch.
        loss_func: Passed to ``loss_batch``. NOTE(review): the result is a
            per-sample mean only if ``loss_func`` returns a batch sum.
        dataset_dl: Iterable yielding ``(inputs, targets)`` batches.
        sanity_check: When True, stop after the first batch.
        opt: Optimizer forwarded to ``loss_batch``; None means no update.

    Returns:
        Tuple ``(loss, metric)``: summed batch losses and summed correct
        counts, each divided by the number of samples seen.

    Raises:
        ZeroDivisionError: If the loader yields no samples.
    """
    running_loss = 0.0
    running_metric = 0.0
    samples_seen = 0

    for xb, yb in dataset_dl:
        xb = xb.to(device)
        yb = yb.to(device)
        output = model(xb)
        # Per-batch loss and correct count (also steps opt when given).
        loss_b, metric_b = loss_batch(loss_func, output, yb, opt)
        running_loss += loss_b
        if metric_b is not None:
            running_metric += metric_b
        samples_seen += len(yb)
        # Sanity-check mode: a single batch is enough to exercise the pipeline.
        if sanity_check is True:
            break

    loss = running_loss / float(samples_seen)
    metric = running_metric / float(samples_seen)
    return loss, metric
show_loss_acc,可视化结果
# Plot the loss and accuracy curves.
def show_loss_acc(num_epochs, loss_hist, metric_hist):
    """Draw train/val loss and then train/val accuracy against epoch number.

    Args:
        num_epochs: Number of points per curve; the x axis is 1..num_epochs.
        loss_hist: Mapping with ``"train"`` and ``"val"`` sequences of losses.
        metric_hist: Mapping with ``"train"`` and ``"val"`` sequences of
            accuracies.
    """
    epochs = range(1, num_epochs + 1)
    # One figure per quantity: (title, history mapping, y-axis label).
    panels = (
        ("Train-Val Loss", loss_hist, "Loss"),
        ("Train-Val Accuracy", metric_hist, "Accuracy"),
    )
    for title, history, y_label in panels:
        plt.title(title)
        for split in ("train", "val"):
            plt.plot(epochs, history[split], label=split)
        plt.ylabel(y_label)
        plt.xlabel("Training Epochs")
        plt.legend()
        plt.show()
构建模型
# Model definition.
class Net(nn.Module):
    """CNN with four 3x3 conv layers followed by two fully connected layers.

    Args:
        params: Mapping with the keys
            ``"input_shape"`` (C, H, W),
            ``"initial_filters"`` (output channels of the first conv; each
            following conv doubles the channel count),
            ``"num_fc1"`` (width of the first fully connected layer),
            ``"num_classes"`` (width of the output layer),
            ``"dropout_rate"`` (dropout probability before the output layer).
    """

    def __init__(self, params):
        super(Net, self).__init__()
        c_in, h_in, w_in = params["input_shape"]
        init_f = params["initial_filters"]
        num_fc1 = params["num_fc1"]
        num_classes = params["num_classes"]
        self.dropout_rate = params["dropout_rate"]

        # After each conv, track the spatial size left once the conv and the
        # 2x pooling applied in forward() have run.
        self.conv1 = nn.Conv2d(c_in, init_f, kernel_size=3)
        h, w = get_conv2d_out_shape(h_in, w_in, self.conv1)
        self.conv2 = nn.Conv2d(init_f, 2 * init_f, kernel_size=3)
        h, w = get_conv2d_out_shape(h, w, self.conv2)
        self.conv3 = nn.Conv2d(2 * init_f, 4 * init_f, kernel_size=3)
        h, w = get_conv2d_out_shape(h, w, self.conv3)
        self.conv4 = nn.Conv2d(4 * init_f, 8 * init_f, kernel_size=3)
        h, w = get_conv2d_out_shape(h, w, self.conv4)

        # Number of features fed to the first fully connected layer.
        self.num_flatten = h * w * 8 * init_f
        self.fc1 = nn.Linear(self.num_flatten, num_fc1)
        self.fc2 = nn.Linear(num_fc1, num_classes)

    def forward(self, input):
        """Return log-probabilities over the classes (log_softmax on dim 1)."""
        x = F.relu(self.conv1(input))
        x = F.max_pool2d(x, 2, 2)
        x = F.relu(self.conv2(x))
        x = F.max_pool2d(x, 2, 2)
        x = F.relu(self.conv3(x))
        x = F.max_pool2d(x, 2, 2)
        x = F.relu(self.conv4(x))
        x = F.max_pool2d(x, 2, 2)
        x = x.view(-1, self.num_flatten)
        x = F.relu(self.fc1(x))
        # F.dropout defaults to training=True; pass the module's mode so
        # dropout is switched off after model.eval().
        x = F.dropout(x, self.dropout_rate, training=self.training)
        x = self.fc2(x)
        return F.log_softmax(x, dim=1)
# Hyper-parameters for the model.
params_model = dict(
    input_shape=(3, 96, 96),  # C*H*W
    initial_filters=8,
    num_fc1=100,
    dropout_rate=0.25,
    num_classes=2,
)
# Build the network and move it to the selected device.
cnn_model = Net(params_model).to(device)
print(cnn_model)
"""
Net(
(conv1): Conv2d(3, 8, kernel_size=(3, 3), stride=(1, 1))
(conv2): Conv2d(8, 16, kernel_size=(3, 3), stride=(1, 1))
(conv3): Conv2d(16, 32, kernel_size=(3, 3), stride=(1, 1))
(conv4): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1))
(fc1): Linear(in_features=1024, out_features=100, bias=True)
(fc2): Linear(in_features=100, out_features=2, bias=True)
)
"""
# Check which device the model lives on (gpu -> cuda:0, cpu -> cpu).
model_device = next(cnn_model.parameters()).device
print(model_device)
查看模型结构及相关信息
from torchsummary import summary

# Print a per-layer summary for a 3x96x96 input.
summary(cnn_model, (3, 96, 96), device=device)
"""
----------------------------------------------------------------
Layer (type) Output Shape Param #
================================================================
Conv2d-1 [-1, 8, 94, 94] 224
Conv2d-2 [-1, 16, 45, 45] 1,168
Conv2d-3 [-1, 32, 20, 20] 4,640
Conv2d-4 [-1, 64, 8, 8] 18,496
Linear-5 [-1, 100] 102,500
Linear-6 [-1, 2] 202
================================================================
Total params: 127,230
Trainable params: 127,230
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.11
Forward/backward pass size (MB): 0.92
Params size (MB): 0.49
Estimated Total Size (MB): 1.51
----------------------------------------------------------------
"""
训练验证模型逻辑函数
# Training / validation loop.
def train_val(model, params):
    """Train ``model`` while tracking the weights with the lowest val loss.

    Each epoch runs one training pass and one validation pass through
    ``loss_epoch``. The best weights are saved to ``path2weights``, and
    they are reloaded whenever the scheduler changes the learning rate.

    Args:
        model: Module to train.
        params: Mapping with the keys ``"num_epochs"``, ``"loss_func"``,
            ``"optimizer"``, ``"train_dl"``, ``"val_dl"``,
            ``"sanity_check"``, ``"lr_scheduler"`` (stepped with the
            validation loss) and ``"path2weights"`` (save target).

    Returns:
        Tuple ``(model, loss_history, metric_history)``. The model holds the
        best weights; each history maps ``"train"``/``"val"`` to a list with
        one value per epoch.
    """
    num_epochs = params["num_epochs"]
    loss_func = params["loss_func"]
    opt = params["optimizer"]
    train_dl = params["train_dl"]
    val_dl = params["val_dl"]
    sanity_check = params["sanity_check"]
    lr_scheduler = params["lr_scheduler"]
    path2weights = params["path2weights"]

    # torch.save does not create missing directories; create the parent
    # folder up front so the first checkpoint cannot fail for that reason.
    if isinstance(path2weights, (str, os.PathLike)):
        weights_dir = os.path.dirname(path2weights)
        if weights_dir:
            os.makedirs(weights_dir, exist_ok=True)

    # Per-epoch loss values.
    loss_history = {
        "train": [],
        "val": [],
    }
    # Per-epoch accuracy values.
    metric_history = {
        "train": [],
        "val": [],
    }

    # Snapshot of the best weights seen so far.
    best_model_wts = copy.deepcopy(model.state_dict())
    best_loss = float('inf')

    for epoch in range(num_epochs):
        current_lr = get_lr(opt)
        print('Epoch {}/{}, current lr={}'.format(epoch, num_epochs - 1, current_lr))

        # Training pass.
        model.train()
        train_loss, train_metric = loss_epoch(model, loss_func, train_dl, sanity_check, opt)
        loss_history["train"].append(train_loss)
        metric_history["train"].append(train_metric)

        # Validation pass: no gradients, no optimizer.
        model.eval()
        with torch.no_grad():
            val_loss, val_metric = loss_epoch(model, loss_func, val_dl, sanity_check)

        # Keep (and persist) the weights with the lowest validation loss.
        if val_loss < best_loss:
            best_loss = val_loss
            best_model_wts = copy.deepcopy(model.state_dict())
            torch.save(model.state_dict(), path2weights)
            print("Copied best model weights!")

        loss_history["val"].append(val_loss)
        metric_history["val"].append(val_metric)

        # Let the scheduler react to the validation loss; if it changed the
        # learning rate, restart from the best weights seen so far.
        lr_scheduler.step(val_loss)
        if current_lr != get_lr(opt):
            print("Loading best model weights!")
            model.load_state_dict(best_model_wts)

        print("train loss: %.6f, dev loss: %.6f, accuracy: %.2f" % (train_loss, val_loss, 100 * val_metric))
        print("-" * 10)

    # Return the model carrying the best weights of the whole run.
    model.load_state_dict(best_model_wts)
    return model, loss_history, metric_history
训练模型(检验整个流程是否能正常运行,sanity_check=True)
# Loss, optimizer and learning-rate schedule for the sanity-check run.
loss_func = nn.NLLLoss(reduction="sum")
opt = optim.Adam(cnn_model.parameters(), lr=3e-4)
lr_scheduler = ReduceLROnPlateau(
    opt,
    mode='min',
    factor=0.5,
    patience=20,
    verbose=1,
)

# sanity_check=True: the run is only meant to check that the pipeline works.
params_train = dict(
    num_epochs=100,
    optimizer=opt,
    loss_func=loss_func,
    train_dl=train_dl,
    val_dl=val_dl,
    sanity_check=True,
    lr_scheduler=lr_scheduler,
    path2weights="./models/bic/weights.pt",
)

# Train and validate the model.
cnn_model, loss_hist, metric_hist = train_val(cnn_model, params_train)
"""
Epoch 0/99, current lr=0.0003
Copied best model weights!
train loss: 0.000071, dev loss: 0.000609, accuracy: 0.12
----------
Epoch 1/99, current lr=0.0003
train loss: 0.000060, dev loss: 0.000682, accuracy: 0.11
"""
# Plot the curves of the sanity-check run.
# `num_epochs` is not defined at module level; take the epoch count from
# the parameters the run was started with.
show_loss_acc(params_train["num_epochs"], loss_hist, metric_hist)
Train-Val Loss.png
Train-Val Acc.png
实际模型训练(数据全部参与训练, sanity_check=False)
# Loss, optimizer and learning-rate schedule for the real training run.
loss_func = nn.NLLLoss(reduction="sum")
opt = optim.Adam(cnn_model.parameters(), lr=3e-4)
lr_scheduler = ReduceLROnPlateau(
    opt,
    mode='min',
    factor=0.5,
    patience=20,
    verbose=1,
)

# Two epochs over the full data (sanity_check disabled).
params_train_nor = dict(
    num_epochs=2,
    optimizer=opt,
    loss_func=loss_func,
    train_dl=train_dl,
    val_dl=val_dl,
    sanity_check=False,
    lr_scheduler=lr_scheduler,
    path2weights="./models/bic/weights.pt",
)

# Train and validate the model.
cnn_model, loss_hist, metric_hist = train_val(cnn_model, params_train_nor)
"""
Epoch 0/1, current lr=0.0003
Copied best model weights!
train loss: 0.443595, dev loss: 0.394328, accuracy: 82.81
----------
Epoch 1/1, current lr=0.0003
Copied best model weights!
train loss: 0.378927, dev loss: 0.358059, accuracy: 84.59
----------
"""
# Plot the loss and accuracy curves of the training run.
show_loss_acc(
    num_epochs=params_train_nor['num_epochs'],
    loss_hist=loss_hist,
    metric_hist=metric_hist,
)
Loss.png
Acc.png
网友评论