I. Overall Approach
The overall idea is straightforward: read in the dataset, build a vocabulary, then feed the indexed text through convolutional and fully connected layers to produce the final classification.
The concrete workflow:
- Read in and clean the data (the dataset is split into three classes of reviews: good, neutral, and bad, and has already been tokenized with jieba)
- Build the vocabulary
- Build the classification model (essentially convolutional plus fully connected layers)
- Train and collect the results
The code below is adapted mainly from ChenZhongFu.
II. Preprocessing
1. Data Cleaning
Cleaning mainly uses regular expressions to strip odd symbols (leftover HTML entity fragments and runs of repeated punctuation), then truncates each text to at most MAX_LEN = 64 tokens.
import re

MAX_LEN = 64  # maximum number of tokens kept per text

# Data cleaning
def clean(sent):
    # punctuation (full- and half-width) to strip; ']' and '\' are escaped so the character class stays intact
    punctuation_remove = r'[、:,?!。;……()『』《》【】~!"#$%&\'()*+,\-./:;<=>?@\[\\\]^_`{|}~]+'
    # remove leftover HTML entity fragments
    sent = re.sub(r'ldquo', "", sent)
    sent = re.sub(r'hellip', "", sent)
    sent = re.sub(r'rdquo', "", sent)
    sent = re.sub(r'yen', "", sent)
    sent = re.sub(r'⑦', "7", sent)
    # drop runs of repeated punctuation such as ",,", "!!", "??", "。。"
    sent = re.sub(r'(, ){2,}', "", sent)
    sent = re.sub(r'(! ){2,}', "", sent)
    sent = re.sub(r'(\? ){2,}', "", sent)  # '?' must be escaped in a regex
    sent = re.sub(r'(。 ){2,}', "", sent)
    sent = re.sub(punctuation_remove, "", sent)  # delete remaining punctuation
    # if longer than MAX_LEN tokens, keep only the first MAX_LEN
    if len(sent.split()) > MAX_LEN:
        s = ' '.join(sent.split()[:MAX_LEN])
    else:
        s = ' '.join(sent.split())  # also collapses extra whitespace
    return s
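For illustration, here is clean applied to a made-up jieba-tokenized sentence (the input string is hypothetical, not taken from the dataset):
print(clean('这个 手机 真的 很 不错 ! ! ! , , 值得 购买 。'))
# -> 这个 手机 真的 很 不错 值得 购买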
2. Loading and Processing the Data
Read and clean each file in turn, build the vocabulary, and finally represent every text as a sequence of vocabulary indices.
# Load the data and convert it to index sequences
word_to_inx = {'<pad>': 0}  # reserve index 0 for padding so it cannot collide with a real token
inx_to_word = {0: '<pad>'}

def get_data():
    with open('./data/good_cut_jieba.txt', 'r', encoding='utf-8') as f:
        good_data = [clean(line).replace('\n', '') for line in f.readlines()]
    good_data_label = [0 for i in range(len(good_data))]
    with open('./data/bad_cut_jieba.txt', 'r', encoding='utf-8') as f:
        bad_data = [clean(line).replace('\n', '') for line in f.readlines()]
    bad_data_label = [1 for i in range(len(bad_data))]
    with open('./data/mid_cut_jieba.txt', 'r', encoding='utf-8') as f:
        mid_data = [clean(line).replace('\n', '') for line in f.readlines()]
    mid_data_label = [2 for i in range(len(mid_data))]
    data = good_data + bad_data + mid_data
    data_label = good_data_label + bad_data_label + mid_data_label
    # collect every token and build the vocabulary
    vocab = set(word for s in data for word in s.split())
    # number the vocabulary, mapping each token to an index ('<pad>' already holds 0)
    for word in vocab:
        inx_to_word[len(word_to_inx)] = word
        word_to_inx[word] = len(word_to_inx)
    # convert each text into its index representation, padded with 0 up to MAX_LEN
    data_id = []
    for s in data:
        s_id = [word_to_inx[word] for word in s.split()]
        s_id = s_id + [0] * (MAX_LEN - len(s_id))
        data_id.append(s_id)
    return data_id, data_label, word_to_inx, inx_to_word
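A small sanity check on the returned mappings (a sketch; it just confirms the two dictionaries are inverses and that every sequence is padded to MAX_LEN):
data_id, data_label, word_to_inx, inx_to_word = get_data()
some_id = 1                                    # index 0 is reserved for '<pad>'
assert word_to_inx[inx_to_word[some_id]] == some_id
assert all(len(s) == MAX_LEN for s in data_id)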
This part simply calls get_data, splits the dataset with train_test_split, and converts the results to LongTensor.
import torch
from sklearn.model_selection import train_test_split

# Convert the data to tensors
def tensorFromData():
    # index-encoded texts and their corresponding labels
    data_id, data_label, _, _ = get_data()
    # split into training and test sets with train_test_split
    data_id_train, data_id_test, data_label_train, data_label_test = train_test_split(
        data_id, data_label, test_size=0.2, random_state=20180127)
    # convert to LongTensor
    data_id_train = torch.LongTensor(data_id_train)
    data_id_test = torch.LongTensor(data_id_test)
    data_label_train = torch.LongTensor(data_label_train)
    data_label_test = torch.LongTensor(data_label_test)
    return data_id_train, data_id_test, data_label_train, data_label_test
A thin wrapper class around the text data; it does very little by itself, and exists mainly so we can use PyTorch's built-in DataLoader.
from torch.utils.data import Dataset

class TextDataSet(Dataset):
    def __init__(self, inputs, outputs):
        self.inputs = inputs
        self.outputs = outputs

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, index):
        return self.inputs[index], self.outputs[index]
3. Usage
# split the data into training and test sets
X_train, X_test, Y_train, Y_test = data_preprocess.tensorFromData()
trainDataSet = data_preprocess.TextDataSet(X_train, Y_train)
testDataSet = data_preprocess.TextDataSet(X_test, Y_test)
trainDataLoader = DataLoader(trainDataSet, batch_size=16, shuffle=True)
testDataLoader = DataLoader(testDataSet, batch_size=16, shuffle=False)
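A quick sanity check on what the loaders yield (a small sketch; batch_size=16 and MAX_LEN=64 follow the settings above):
x, y = next(iter(trainDataLoader))
print(x.shape, y.shape)  # torch.Size([16, 64]) torch.Size([16])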
III. Model Construction
1. Convolution
First, a quick look at how 1D convolution (nn.Conv1d) behaves:
m = nn.Conv1d(16, 33, 3, stride=2)
input = torch.randn(20, 16, 5)  # (batch, in_channels, length)
output = m(input)
output.shape
>> torch.Size([20, 33, 2])
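The output length follows L_out = floor((L_in + 2*padding - kernel_size) / stride) + 1; here floor((5 + 0 - 3) / 2) + 1 = 2, matching torch.Size([20, 33, 2]). One quirk to note in the model below: the embedding output (batch, 64, 128) is passed to Conv1d without transposing, so, as the shape comments show, the 64-token sequence axis serves as the input channels and convolution slides along the 128-dimensional embedding axis. That is why input_dim is 64 (the sequence length) rather than the embedding size.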
2. Model Definition
import torch.nn as nn
import torch.nn.functional as F

# Model definition
class CNN_model(nn.Module):
    def __init__(self, len_dic, input_dim, emb_dim):
        super(CNN_model, self).__init__()
        self.embed = nn.Embedding(len_dic, emb_dim)  # (batch, 64 [sequence length], 128 [embedding size])
        self.conv = nn.Sequential(
            nn.Conv1d(input_dim, 256, kernel_size=3, padding=1),  # (batch, 256, 128)
            nn.MaxPool1d(2, 2),                                   # (batch, 256, 64)
            nn.Conv1d(256, 128, kernel_size=3, padding=1),        # (batch, 128, 64)
            nn.MaxPool1d(2, 2),                                   # (batch, 128, 32)
            nn.Conv1d(128, 64, kernel_size=3, padding=1),         # (batch, 64, 32)
            nn.MaxPool1d(2, 2),                                   # (batch, 64, 16)
        )
        self.bn = nn.BatchNorm1d(64)           # (batch, 64, 16)
        self.drop = nn.Dropout(0.1)
        self.linear = nn.Linear(64 * 16, 256)  # (batch, 256)
        self.relu = nn.ReLU(True)
        self.classify = nn.Linear(256, 3)      # (batch, 3)

    def forward(self, x):
        x = self.embed(x)
        x = self.conv(x)      # the full convolutional stack
        x = self.bn(x)
        b, c, l = x.size()
        x = x.view(b, c * l)  # flatten (batch, 64, 16) -> (batch, 64*16)
        x = self.drop(x)
        x = self.linear(x)
        x = self.relu(x)
        out = self.classify(x)
        out = F.log_softmax(out, dim=1)  # log-probabilities, to pair with NLLLoss
        return out
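A quick shape check (a hedged sketch: the vocabulary size of 5000 is an arbitrary placeholder; input_dim must equal MAX_LEN = 64 as discussed above):
import torch
model = CNN_model(len_dic=5000, input_dim=64, emb_dim=128)
x = torch.randint(0, 5000, (16, 64))  # a fake batch of 16 index sequences
out = model(x)
print(out.shape)  # torch.Size([16, 3]); each row holds log-probabilities over the 3 classes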
IV. Training
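The loop below references several names (model, criterion, optimizer, use_cuda, num_epoches, batch_size, best_acc) that the original post defines elsewhere. A minimal sketch of that setup, with assumed hyperparameter values (epoch count, optimizer, learning rate) where the post does not state them:
import copy
import torch.optim as optim

use_cuda = torch.cuda.is_available()
num_epoches = 20    # assumed; not specified in the post
batch_size = 16     # matches the DataLoader batch size above
best_acc = 0.0
best_model = None

model = CNN_model(len_dic=len(word_to_inx), input_dim=64, emb_dim=128)
if use_cuda:
    model = model.cuda()
criterion = nn.NLLLoss()  # negative log-likelihood, pairs with the model's log_softmax output
optimizer = optim.Adam(model.parameters(), lr=1e-3)  # assumed optimizer and learning rate

# once training finishes, the best weights can be saved with, e.g.:
# torch.save(best_model, './best_cnn_model.pth')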
for epoch in range(num_epoches):
    train_loss = 0
    train_acc = 0
    model.train()
    for i, data in enumerate(trainDataLoader):
        x, y = data
        if use_cuda:
            x, y = x.cuda(), y.cuda()
        # forward
        out = model(x)
        # criterion = nn.NLLLoss(), the negative log-likelihood loss for multi-class classification
        loss = criterion(out, y)
        train_loss += loss.item() * len(y)
        # take the most likely class
        _, pre = torch.max(out, 1)
        # count correct predictions for the accuracy
        num_acc = (pre == y).sum()
        train_acc += num_acc.item()
        # backward
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        if (i + 1) % 100 == 0:
            print('[{}/{}],train loss is:{:.6f},train acc is:{:.6f}'.format(
                i + 1, len(trainDataLoader),
                train_loss / ((i + 1) * batch_size),
                train_acc / ((i + 1) * batch_size)))
    print('epoch:[{}],train loss is:{:.6f},train acc is:{:.6f}'.format(
        epoch,
        train_loss / (len(trainDataLoader) * batch_size),
        train_acc / (len(trainDataLoader) * batch_size)))

    model.eval()
    eval_loss = 0
    eval_acc = 0
    with torch.no_grad():  # no gradients needed during evaluation
        for i, data in enumerate(testDataLoader):
            x, y = data
            if use_cuda:
                x, y = x.cuda(), y.cuda()
            out = model(x)
            loss = criterion(out, y)
            eval_loss += loss.item() * len(y)
            _, pre = torch.max(out, 1)
            num_acc = (pre == y).sum()
            eval_acc += num_acc.item()
    print('test loss is:{:.6f},test acc is:{:.6f}'.format(
        eval_loss / (len(testDataLoader) * batch_size),
        eval_acc / (len(testDataLoader) * batch_size)))
    if best_acc < (eval_acc / (len(testDataLoader) * batch_size)):
        best_acc = eval_acc / (len(testDataLoader) * batch_size)
        best_model = copy.deepcopy(model.state_dict())  # deep-copy so later training cannot mutate the saved weights
        print('best acc is {:.6f},best model is changed'.format(best_acc))