https://zhuanlan.zhihu.com/p/64781896
1. Dataset preprocessing
import torch
import pandas as pd
from torch import nn
import numpy as np
import matplotlib.pyplot as plt
# %matplotlib inline  # lines starting with % are Jupyter magic commands; this one renders plots directly in the notebook page. If you port the code to PyCharm, use plt.show() instead.
# 1. Data preprocessing
data = pd.read_csv('./HR.csv')
# print(data.info()) # show the columns and their dtypes
# <class 'pandas.core.frame.DataFrame'>
# RangeIndex: 14999 entries, 0 to 14998
# Data columns (total 10 columns):
# # Column Non-Null Count Dtype
# --- ------ -------------- -----
# 0 satisfaction_level 14999 non-null float64
# 1 last_evaluation 14999 non-null float64
# 2 number_project 14999 non-null int64
# 3 average_montly_hours 14999 non-null int64
# 4 time_spend_company 14999 non-null int64
# 5 Work_accident 14999 non-null int64
# 6 left 14999 non-null int64
# 7 promotion_last_5years 14999 non-null int64
# 8 part 14999 non-null object # object here means a Python object (i.e. text)
# 9 salary 14999 non-null object
# dtypes: float64(2), int64(6), object(2)
# memory usage: 1.1+ MB
# None
# print(data.part.unique()) # show which departments exist
# ['sales' 'accounting' 'hr' 'technical' 'support' 'management' 'IT' 'product_mng' 'marketing' 'RandD']
# object columns hold text and need to be converted to numeric values.
# print(data.salary.unique()) # show the salary categories
# ['low' 'medium' 'high']
# print(data.groupby(['salary','part']).size()) # group by salary first, then by part (department), and count each group
# salary part
# high IT 83
# RandD 51
# accounting 74
# hr 45
# management 225
# marketing 80
# product_mng 68
# sales 269
# support 141
# technical 201
# low IT 609
# RandD 364
# accounting 358
# hr 335
# management 180
# marketing 402
# product_mng 451
# sales 2099
# support 1146
# technical 1372
# medium IT 535
# RandD 372
# accounting 335
# hr 359
# management 225
# marketing 376
# product_mng 383
# sales 1772
# support 942
# technical 1147
# dtype: int64
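# (Optional aside, not in the original post: the same counts are easier to scan as a
# salary x department table; .unstack() pivots the inner index level into columns.)
# print(data.groupby(['salary','part']).size().unstack())
# Equivalently: print(pd.crosstab(data.salary, data.part))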
# pd.get_dummies() converts an object-typed column into one-hot encoded columns
# print(pd.get_dummies(data.salary))
# high low medium
# 0 0 1 0
# 1 0 0 1
# 2 0 0 1
# 3 0 1 0
# 4 0 1 0
# ... ... ... ...
# 14994 0 1 0
# 14995 0 1 0
# 14996 0 1 0
# 14997 0 1 0
# 14998 0 1 0
#
# [14999 rows x 3 columns]
data = data.join(pd.get_dummies(data.salary)) # join the 3 one-hot columns onto data
# print(data.info) # 3 columns (high/low/medium) are appended to the original dataset; note data.info without () prints the bound method, as the output below shows
# bound method DataFrame.info of satisfaction_level last_evaluation number_project ... high low medium
# 0 0.38 0.53 2 ... 0 1 0
# 1 0.80 0.86 5 ... 0 0 1
# 2 0.11 0.88 7 ... 0 0 1
# 3 0.72 0.87 5 ... 0 1 0
# 4 0.37 0.52 2 ... 0 1 0
# ... ... ... ... ... ... ... ...
# 14994 0.40 0.57 2 ... 0 1 0
# 14995 0.37 0.48 2 ... 0 1 0
# 14996 0.37 0.53 2 ... 0 1 0
# 14997 0.11 0.96 6 ... 0 1 0
# 14998 0.37 0.52 2 ... 0 1 0
del data['salary'] # drop the original salary column
data = data.join(pd.get_dummies(data.part)) # handle the part column the same way
del data['part']
# print(data.info) # now [14999 rows x 21 columns]
# Extract Y: whether the employee left
# print(data.left.value_counts()) # the classes are imbalanced: leavers are only a small fraction.
# A model that predicts "stays" for every employee already achieves accuracy 11428/14999 by chance alone,
# so a model is only useful if its accuracy is higher than 11428/14999.
# 0 11428
# 1 3571
# Name: left, dtype: int64
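# (Optional aside, not in the original post: computing that majority-class baseline directly.)
# baseline = data.left.value_counts().max() / len(data)
# print(baseline) # 11428 / 14999 ≈ 0.762 — a useful model must beat this accuracy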
Y_data = data.left.values.reshape(-1,1) # second dimension is 1, first dimension is inferred automatically
# print(Y_data.shape) # (14999, 1)
Y = torch.from_numpy(Y_data).type(torch.FloatTensor)
X_data = data[[c for c in data.columns if c != 'left']].values # select every column except 'left' (a list inside [] selects columns)
X = torch.from_numpy(X_data).type(torch.FloatTensor)
# print(X.shape) #torch.Size([14999, 20])
2. Building the multilayer perceptron model
# 2. Custom model
# Build the model as a custom class:
# nn.Module: the class to inherit from
# __init__: initialize all the layers
# forward: define the model's computation (the forward pass)
class Model(nn.Module):
def __init__(self):
        super().__init__() # inherit all attributes of the parent class
        self.linear_1 = nn.Linear(20,64) # X has 20 columns, i.e. 20 input features; the first hidden layer has 64 units
        self.linear_2 = nn.Linear(64,64) # layer 2's input is layer 1's output (64 features); its hidden size is also 64
        self.linear_3 = nn.Linear(64,1) # layer 3 takes the 64-dim output; one output unit for binary (logistic) classification
self.relu = nn.ReLU()
self.sigmoid = nn.Sigmoid()
    def forward(self,input): # forward calls these layers in order
        x = self.linear_1(input) # call the first layer
        x = self.relu(x) # then apply the ReLU activation
x = self.linear_2(x)
x = self.relu(x)
x = self.linear_3(x)
x = self.sigmoid(x)
return x
model = Model()
# print(model)
# # Model(
# # (linear_1): Linear(in_features=20, out_features=64, bias=True)
# # (linear_2): Linear(in_features=64, out_features=64, bias=True)
# # (linear_3): Linear(in_features=64, out_features=1, bias=True)
# # (relu): ReLU()
# # (sigmoid): Sigmoid()
# # )
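# (Optional sanity check, not in the original post: a forward pass on the first two rows
# should return a [2, 1] tensor of probabilities between 0 and 1.)
# print(model(X[:2]))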
3. Rewriting the model, with explanation
# 3. Model rewrite, with explanation
import torch.nn.functional as F
# F.relu(x) calls the activation directly as a function; F mirrors the layer concepts as functions
# F.sigmoid(x) (note: deprecated in newer PyTorch; torch.sigmoid is the preferred equivalent)
# F is a slightly lower-level API than nn
# the rewritten code below is more concise
class Model(nn.Module):
def __init__(self):
        super().__init__() # inherit all attributes of the parent class
        self.linear_1 = nn.Linear(20,64) # 20 input features; first hidden layer of 64 units
        self.linear_2 = nn.Linear(64,64) # 64-in, 64-out hidden layer
        self.linear_3 = nn.Linear(64,1) # one output unit for binary (logistic) classification
    def forward(self,input): # forward calls the layers in order
        x = F.relu(self.linear_1(input)) # first layer followed by ReLU
x = F.relu(self.linear_2(x))
x = F.sigmoid(self.linear_3(x))
return x
model = Model()
# print(model) # compared with the previous version, the ReLU and Sigmoid layers no longer appear; still 20 input features and 64 hidden units
# # Model(
# # (linear_1): Linear(in_features=20, out_features=64, bias=True)
# # (linear_2): Linear(in_features=64, out_features=64, bias=True)
# # (linear_3): Linear(in_features=64, out_features=1, bias=True)
# # )
lr = 0.0001
def get_model():
model = Model()
    opt = torch.optim.Adam(model.parameters(),lr=lr) # Adam optimizer
return model, opt
model,optim = get_model()
4. Defining the loss function
# 4. Define the loss function
loss_fn = nn.BCELoss()
batch = 64
no_of_batches = len(data)//batch
epochs = 100
for epoch in range(epochs):
for i in range(no_of_batches):
start = i*batch
end = start + batch
x = X[start:end]
y = Y[start:end]
y_pred = model(x)
loss = loss_fn(y_pred,y)
optim.zero_grad()
loss.backward()
optim.step()
with torch.no_grad():
print('epoch: ',epoch, 'loss: ',loss_fn(model(X),Y).data.item())
# epoch: 0 loss: 0.7169897556304932
# epoch: 1 loss: 0.7474398612976074
# epoch: 2 loss: 0.7510539889335632
# epoch: 3 loss: 0.7487813830375671
# epoch: 4 loss: 0.7463597655296326
# epoch: 5 loss: 0.7404094934463501
# epoch: 6 loss: 0.7263101935386658
# epoch: 7 loss: 0.7180029153823853
# epoch: 8 loss: 0.703711211681366
# epoch: 9 loss: 0.686420738697052
# epoch: 10 loss: 0.6773249506950378
# epoch: 11 loss: 0.6685119271278381
# epoch: 12 loss: 0.6593860387802124
# epoch: 13 loss: 0.6474876999855042
# epoch: 14 loss: 0.6377690434455872
# epoch: 15 loss: 0.6245443820953369
# epoch: 16 loss: 0.6224199533462524
# epoch: 17 loss: 0.6104769110679626
# epoch: 18 loss: 0.6039217114448547
# epoch: 19 loss: 0.5974984169006348
# epoch: 20 loss: 0.591426432132721
# epoch: 21 loss: 0.5852314829826355
# epoch: 22 loss: 0.5801721215248108
# epoch: 23 loss: 0.6171563267707825
# epoch: 24 loss: 0.5970718860626221
# epoch: 25 loss: 0.5865799784660339
# epoch: 26 loss: 0.5799813866615295
# epoch: 27 loss: 0.5736305713653564
# epoch: 28 loss: 0.5689525008201599
# epoch: 29 loss: 0.5653705596923828
# epoch: 30 loss: 0.5621749758720398
# epoch: 31 loss: 0.5603551268577576
# epoch: 32 loss: 0.5558130741119385
# epoch: 33 loss: 0.554741621017456
# epoch: 34 loss: 0.5537745952606201
# epoch: 35 loss: 0.5537812113761902
# epoch: 36 loss: 0.5566486716270447
# epoch: 37 loss: 0.5538275241851807
# epoch: 38 loss: 0.5533592700958252
# epoch: 39 loss: 0.5531929731369019
# epoch: 40 loss: 0.5532180666923523
# epoch: 41 loss: 0.5534858107566833
# epoch: 42 loss: 0.5535756349563599
# epoch: 43 loss: 0.5538690090179443
# epoch: 44 loss: 0.5541990995407104
# epoch: 45 loss: 0.5545604228973389
# epoch: 46 loss: 0.555756688117981
# epoch: 47 loss: 0.5562994480133057
# epoch: 48 loss: 0.5558972358703613
# epoch: 49 loss: 0.5573424100875854
# epoch: 50 loss: 0.5569452047348022
# epoch: 51 loss: 0.5557860136032104
# epoch: 52 loss: 0.5561386346817017
# epoch: 53 loss: 0.5558918118476868
# epoch: 54 loss: 0.5567163228988647
# epoch: 55 loss: 0.5555370450019836
# epoch: 56 loss: 0.5590798854827881
# epoch: 57 loss: 0.5561874508857727
# epoch: 58 loss: 0.5567173361778259
# epoch: 59 loss: 0.5563571453094482
# epoch: 60 loss: 0.5565804243087769
# epoch: 61 loss: 0.5580364465713501
# epoch: 62 loss: 0.5550894141197205
# epoch: 63 loss: 0.5556345582008362
# epoch: 64 loss: 0.5544098615646362
# epoch: 65 loss: 0.554726243019104
# epoch: 66 loss: 0.5676969885826111
# epoch: 67 loss: 0.5639616847038269
# epoch: 68 loss: 0.5534506440162659
# epoch: 69 loss: 0.5482341647148132
# epoch: 70 loss: 0.5490630269050598
# epoch: 71 loss: 0.550289511680603
# epoch: 72 loss: 0.558487057685852
# epoch: 73 loss: 0.5513705611228943
# epoch: 74 loss: 0.5461589097976685
# epoch: 75 loss: 0.5472497940063477
# epoch: 76 loss: 0.5445483922958374
# epoch: 77 loss: 0.5462327599525452
# epoch: 78 loss: 0.5444859862327576
# epoch: 79 loss: 0.5440321564674377
# epoch: 80 loss: 0.5433013439178467
# epoch: 81 loss: 0.5420659184455872
# epoch: 82 loss: 0.5504337549209595
# epoch: 83 loss: 0.5655510425567627
# epoch: 84 loss: 0.5413082242012024
# epoch: 85 loss: 0.5385186672210693
# epoch: 86 loss: 0.5376941561698914
# epoch: 87 loss: 0.5360188484191895
# epoch: 88 loss: 0.5352768898010254
# epoch: 89 loss: 0.5340187549591064
# epoch: 90 loss: 0.532996416091919
# epoch: 91 loss: 0.5334175229072571
# epoch: 92 loss: 0.5310117602348328
# epoch: 93 loss: 0.5338006615638733
# epoch: 94 loss: 0.5294851064682007
# epoch: 95 loss: 0.5323388576507568
# epoch: 96 loss: 0.5423325300216675
# epoch: 97 loss: 0.5269168615341187
# epoch: 98 loss: 0.5257655382156372
# epoch: 99 loss: 0.5224730372428894
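# (Optional aside, not in the original post: matplotlib is imported at the top but never used.
# A minimal sketch, assuming the same model/optim/X/Y as above, that records the per-epoch
# loss and plots the training curve:)
# losses = []
# for epoch in range(epochs):
#     for i in range(no_of_batches):
#         x, y = X[i*batch:(i+1)*batch], Y[i*batch:(i+1)*batch]
#         loss = loss_fn(model(x), y)
#         optim.zero_grad()
#         loss.backward()
#         optim.step()
#     with torch.no_grad():
#         losses.append(loss_fn(model(X), Y).item())
# plt.plot(range(epochs), losses)
# plt.xlabel('epoch')
# plt.ylabel('BCE loss')
# plt.show()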
5. Refactoring with Dataset
# Refactor using Dataset
# PyTorch has an abstract Dataset class. A Dataset can be anything that has a __len__ method and a __getitem__ method for indexing, so a custom HRDataset subclass of Dataset could be written the same way (a sketch follows below).
# PyTorch's TensorDataset is a Dataset that wraps tensors. By defining how long it is and how it is indexed, it gives us a way to iterate over, index, and slice along the first dimension of the tensors, which makes it easier to fetch the independent and dependent variables on the same line of the training loop.
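# (The code below uses TensorDataset directly; as an illustration of the __len__/__getitem__
# contract described above, a hypothetical HRDataset subclass might look like this. It is
# not used in the rest of the script.)
from torch.utils.data import Dataset
class HRDataset(Dataset):
    def __init__(self, X, Y):
        self.X, self.Y = X, Y # feature tensor and label tensor
    def __len__(self):
        return len(self.X) # number of samples
    def __getitem__(self, idx):
        return self.X[idx], self.Y[idx] # one (features, label) pair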
import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset
import pandas as pd
data = pd.read_csv('./HR.csv')
data = data.join(pd.get_dummies(data.salary)) # join the 3 one-hot columns onto data
del data['salary'] # drop the original salary column
data = data.join(pd.get_dummies(data.part)) # handle the part column the same way
del data['part']
Y_data = data.left.values.reshape(-1,1) # second dimension is 1, first dimension is inferred automatically
Y = torch.from_numpy(Y_data).type(torch.FloatTensor)
X_data = data[[c for c in data.columns if c != 'left']].values # select every column except 'left'
X = torch.from_numpy(X_data).type(torch.FloatTensor)
HRdataset = TensorDataset(X, Y) # Create HR Dataset
class Model(nn.Module):
def __init__(self):
        super().__init__() # inherit all attributes of the parent class
        self.linear_1 = nn.Linear(20,64) # 20 input features; first hidden layer of 64 units
        self.linear_2 = nn.Linear(64,64) # 64-in, 64-out hidden layer
        self.linear_3 = nn.Linear(64,1) # one output unit for binary (logistic) classification
    def forward(self,input): # forward calls the layers in order
        x = F.relu(self.linear_1(input)) # first layer followed by ReLU
x = F.relu(self.linear_2(x))
x = F.sigmoid(self.linear_3(x))
return x
model = Model()
lr = 0.0001
def get_model():
model = Model()
    opt = torch.optim.Adam(model.parameters(),lr=lr) # Adam optimizer
return model, opt
model,opt = get_model()
loss_fn = nn.BCELoss()
batch = 64
no_of_batches = len(data)//batch
epochs = 100
for epoch in range(epochs):
for i in range(no_of_batches):
x, y = HRdataset[i * batch: i * batch + batch]
y_pred = model(x)
loss = loss_fn(y_pred, y)
opt.zero_grad()
loss.backward()
opt.step()
with torch.no_grad():
print('epoch:', epoch, ' ', 'loss:', loss_fn(model(X), Y))
6. Refactoring with DataLoader
# Refactor using DataLoader
# PyTorch's DataLoader is responsible for managing batches.
# A DataLoader is created from a Dataset.
# DataLoader makes iterating over batches easier: it automatically hands us each minibatch,
# so there is no need to slice manually with HRdataset[i * batch: i * batch + batch].
import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader
import pandas as pd
data = pd.read_csv('./HR.csv')
data = data.join(pd.get_dummies(data.salary)) # join the 3 one-hot columns onto data
del data['salary'] # drop the original salary column
data = data.join(pd.get_dummies(data.part)) # handle the part column the same way
del data['part']
Y_data = data.left.values.reshape(-1,1) # second dimension is 1, first dimension is inferred automatically
Y = torch.from_numpy(Y_data).type(torch.FloatTensor)
X_data = data[[c for c in data.columns if c != 'left']].values # select every column except 'left'
X = torch.from_numpy(X_data).type(torch.FloatTensor)
class Model(nn.Module):
def __init__(self):
        super().__init__() # inherit all attributes of the parent class
        self.linear_1 = nn.Linear(20,64) # 20 input features; first hidden layer of 64 units
        self.linear_2 = nn.Linear(64,64) # 64-in, 64-out hidden layer
        self.linear_3 = nn.Linear(64,1) # one output unit for binary (logistic) classification
    def forward(self,input): # forward calls the layers in order
        x = F.relu(self.linear_1(input)) # first layer followed by ReLU
x = F.relu(self.linear_2(x))
x = F.sigmoid(self.linear_3(x))
return x
model = Model()
lr = 0.0001
def get_model():
model = Model()
    opt = torch.optim.Adam(model.parameters(),lr=lr) # Adam optimizer
return model, opt
model,opt = get_model()
loss_fn = nn.BCELoss()
batch = 64
no_of_batches = len(data)//batch # no longer needed below: DataLoader handles batching
epochs = 100
# build the DataLoader
HR_ds = TensorDataset(X, Y)
# HR_dl = DataLoader(HR_ds, batch_size=batch, shuffle=True) # shuffling the dataset helps the loss go down (see the comparison below)
HR_dl = DataLoader(HR_ds, batch_size=batch, shuffle=False)
for epoch in range(epochs):
for x, y in HR_dl:
y_pred = model(x)
loss = loss_fn(y_pred, y)
opt.zero_grad()
loss.backward()
opt.step()
with torch.no_grad():
print('epoch:', epoch, ' ', 'loss:', loss_fn(model(X), Y))
# With the Dataset approach, final loss: epoch: 99 loss: tensor(0.5242)
# With DataLoader + shuffle=False, final loss: epoch: 99 loss: tensor(0.4920)
# With DataLoader + shuffle=True, final loss: epoch: 99 loss: tensor(0.2838)
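# (Optional aside, not in the original post: the baseline in section 1 was an accuracy of
# 11428/14999 ≈ 0.762, while the numbers above are BCE losses. A minimal sketch for checking
# the trained model's accuracy against that baseline:)
with torch.no_grad():
    preds = (model(X) > 0.5).float() # threshold the sigmoid outputs at 0.5
    print('accuracy:', (preds == Y).float().mean().item())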