首先准备了正负数据用作学习材料:
neg.txt:5331条负面电影评论
pos.txt:5331条正面电影评论
这两个文件已上传到我的CSDN,有兴趣的可以自行下载:http://download.csdn.net/download/qq_34470213/10144755
学习的思路可以总结为:
- 使用nltk来对数据进行处理
- 建立神经网络进行训练
- 检测正确率
具体实现:
1、数据处理
- 使用nltk把数据的每个词都分出来,如:I'm -- I 'm
- 进行词形还原,如:cats -- cat
- 统计每个单词的数目
- 出现次数过多或过少的单词不参与计算:像 is、a 这类单词出现得太频繁,对判断评论的正负面几乎没有帮助
- 满足第4条条件的单词作为词典,对每条评论进行编码。如满足的单词为:
['woman', 'great', 'feel', 'actually', 'looking', 'latest', 'seen', 'is']
则评论 i think this movie is great 经过编码后的结果为: [0,1,0,0,0,0,0,1](词典中的 great 和 is 出现在评论里,对应位置记为1,其余为0)。
2、开始训练
- 用步骤1的编码结果作为输入结点,两个输出节点,对数据进行训练。
- 建立全连接层,两个隐藏层,设每个隐藏层的结点数目为1000个,并使用relu激活函数进行去线性化
3、检测正确率
从所有数据中取后10%作为测试数据,以规定的标签(neg.txt中的评论 Y 为 [0, 1],pos.txt中的评论 Y 为 [1, 0])
作为正确结果来统计正确率。
代码展示:
import numpy as np
import tensorflow as tf
import random
from collections import Counter
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
# Maps the predicted class index (as a string) to the label printed for the user.
diary = {
    "0": "好评",
    "1": "差评",
}
# 创建词汇表
def create_lexicon(pos_file, neg_file):
    """Build the vocabulary used to encode reviews.

    Every line of both files is lower-cased and tokenized with NLTK, each
    token is lemmatized (e.g. cats -> cat), and the occurrences are counted
    over both files together. Only words whose count is strictly between 20
    and 2000 are kept: very frequent tokens (punctuation, "the", "a", ...)
    and very rare ones are dropped.

    Args:
        pos_file: Path of the text file holding the positive reviews.
        neg_file: Path of the text file holding the negative reviews.

    Returns:
        A list of the retained lemmatized words, in first-seen order.
    """
    def process_file(path):
        """Return all lower-cased tokens of the file at *path*."""
        tokens = []
        # Fix: the original opened ``pos_file`` here regardless of the
        # argument, so the negative reviews never contributed to the lexicon
        # and the positive file was counted twice.
        # NOTE(review): the file is read with the platform default encoding,
        # as before - confirm it matches how pos.txt / neg.txt are encoded.
        with open(path, 'r') as f:
            for line in f:
                tokens += word_tokenize(line.lower())
        return tokens

    tokens = process_file(pos_file) + process_file(neg_file)

    lemmatizer = WordNetLemmatizer()
    word_count = Counter(lemmatizer.lemmatize(word) for word in tokens)
    # Example of the counts seen on this corpus:
    # {'.': 13944, ',': 10536, 'the': 10120, 'a': 9444, 'and': 7108, ...}

    return [word for word, count in word_count.items() if 20 < count < 2000]
# Locations of the two corpora (positive and negative movie reviews).
pos_file = 'D:/pos.txt'
neg_file = 'D:/neg.txt'
# lex holds the vocabulary: the words from both corpora that create_lexicon kept.
lex = create_lexicon(pos_file=pos_file, neg_file=neg_file)
def string_to_vector(lex, line, clf):
    """Encode one review as a word-presence vector over the vocabulary.

    The text is lower-cased, tokenized and lemmatized the same way the
    lexicon was built; position ``i`` of the vector is set to 1 when
    ``lex[i]`` occurs in the review (occurrences are not counted).

    Args:
        lex: Vocabulary list; its order defines the vector positions.
        line: Raw review text.
        clf: Label to attach to the sample (e.g. ``[1, 0]``), or ``None``
            for an unlabelled review.

    Returns:
        ``[features]`` when ``clf`` is ``None``, otherwise
        ``[features, clf]``; ``features`` is a float NumPy array of length
        ``len(lex)``.
    """
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in word_tokenize(line.lower())]

    # Word -> first position in lex. One dict build replaces the repeated
    # linear `word in lex` / `lex.index(word)` scans of the original.
    position = {}
    for idx, word in enumerate(lex):
        position.setdefault(word, idx)

    features = np.zeros(len(lex))
    for word in words:
        idx = position.get(word)
        if idx is not None:
            # Presence only; `+= 1` would count repeats instead.
            features[idx] = 1

    # `is None` instead of `None == clf`: the equality form is evaluated
    # element-wise (and fails in a boolean context) for array-like labels.
    if clf is None:
        return [features]
    return [features, clf]
# 把每条评论转换为向量(转换原理见上文的编码示例)
def normalize_dataset(lex):
    """Turn every review of both corpora into a labelled sample.

    Reads the module-level ``pos_file`` and then ``neg_file`` line by line
    and encodes each line with ``string_to_vector``.

    Args:
        lex: Vocabulary list used for the encoding.

    Returns:
        A list of ``[features, label]`` samples, positive reviews first;
        the label is ``[1, 0]`` for positive and ``[0, 1]`` for negative
        reviews.
    """
    samples = []
    labelled_sources = (
        (pos_file, (1, 0)),  # positive reviews
        (neg_file, (0, 1)),  # negative reviews
    )
    for path, label in labelled_sources:
        with open(path, 'r') as f:
            for review in f:
                # A fresh label list per sample, as in the original.
                samples.append(string_to_vector(lex, review, list(label)))
    return samples
dataset = normalize_dataset(lex)
random.shuffle(dataset)
"""
把整理好的数据保存到文件,方便使用。到此完成了数据的整理工作
with open('save.pickle', 'wb') as f:
pickle.dump(dataset, f)
"""
# Hold out the last 10% of the shuffled samples as test data.
test_size = int(len(dataset) * 0.1)
# Each sample is [feature_vector, label], i.e. a ragged pair. dtype=object
# keeps the (n_samples, 2) layout that the `[:, 0]` / `[:, 1]` indexing later
# relies on; without it NumPy >= 1.24 raises on ragged input.
dataset = np.array(dataset, dtype=object)
# Split by an explicit index: `dataset[:-test_size]` would be empty when
# test_size is 0 (fewer than 10 samples).
split_at = len(dataset) - test_size
train_dataset = dataset[:split_at]
test_dataset = dataset[split_at:]
# Feed-forward neural network: number of units in each layer.
n_input_layer = len(lex)   # input layer: one unit per vocabulary word
n_layer_1 = n_layer_2 = 1000  # the two hidden layers (everything between input and output)
n_output_layer = 2         # output layer
# 定义待训练的神经网络
def neural_network(data):
    """Build the feed-forward graph and return its output (logits) tensor.

    Two hidden layers with ReLU activation are followed by a linear output
    layer. Layer sizes come from the module-level ``n_input_layer``,
    ``n_layer_1``, ``n_layer_2`` and ``n_output_layer``; all weights and
    biases are fresh ``tf.Variable`` objects initialised from
    ``tf.random_normal``.

    Args:
        data: Input tensor fed to the first layer.

    Returns:
        The un-activated output tensor of the last layer.
    """
    def make_params(n_in, n_out):
        # Weight matrix first, then bias - the same creation order as before.
        return {
            'w_': tf.Variable(tf.random_normal([n_in, n_out])),
            'b_': tf.Variable(tf.random_normal([n_out])),
        }

    # Create all parameters up front: hidden 1, hidden 2, output.
    hidden_1 = make_params(n_input_layer, n_layer_1)
    hidden_2 = make_params(n_layer_1, n_layer_2)
    output = make_params(n_layer_2, n_output_layer)

    # relu(x . w + b) for the hidden layers, plain x . w + b for the output.
    activation_1 = tf.nn.relu(tf.add(tf.matmul(data, hidden_1['w_']), hidden_1['b_']))
    activation_2 = tf.nn.relu(tf.add(tf.matmul(activation_1, hidden_2['w_']), hidden_2['b_']))
    return tf.add(tf.matmul(activation_2, output['w_']), output['b_'])
# Number of samples fed to the network per training step.
batch_size = 50
# Input placeholder: any number of rows, one column per feature. Declaring the
# shape lets TensorFlow reject mis-shaped input; it is optional.
X = tf.placeholder('float', shape=[None, len(train_dataset[0][0])])
# Label placeholder, left without a declared shape.
Y = tf.placeholder(dtype='float')
# 使用数据训练神经网络
def train_neural_network(X, Y):
    """Train the network, report held-out accuracy, then classify typed reviews.

    Builds the graph with ``neural_network``, minimises the softmax
    cross-entropy with Adam for 12 epochs in mini-batches of ``batch_size``
    over the module-level ``train_dataset``, prints the summed loss of each
    epoch, evaluates accuracy on ``test_dataset`` and finally enters an
    interactive loop that classifies reviews read from standard input until
    the user types ``quit``.

    Args:
        X: Placeholder for the feature vectors.
        Y: Placeholder for the one-hot labels.

    Returns:
        None. Results are printed.
    """
    predict = neural_network(X)
    cost_func = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=Y, logits=predict))
    optimizer = tf.train.AdamOptimizer().minimize(cost_func)

    epochs = 12
    with tf.Session() as session:
        # NOTE(review): tf.initialize_all_variables is deprecated in favour of
        # tf.global_variables_initializer; kept for the TF version in use.
        session.run(tf.initialize_all_variables())

        # Fix: random.shuffle swaps rows of a 2-D NumPy array through views,
        # which duplicates some samples and loses others. np.random.shuffle
        # permutes the rows in place correctly.
        np.random.shuffle(train_dataset)

        # Fix: train on the training split only. The original sliced
        # `dataset`, so the held-out 10% was also trained on and the
        # reported accuracy was inflated.
        train_x = train_dataset[:, 0]
        train_y = train_dataset[:, 1]

        for epoch in range(epochs):
            epoch_loss = 0
            for start in range(0, len(train_x), batch_size):
                end = start + batch_size
                batch_x = train_x[start:end]
                batch_y = train_y[start:end]
                _, c = session.run([optimizer, cost_func],
                                   feed_dict={X: list(batch_x), Y: list(batch_y)})
                epoch_loss += c
            print("第", epoch, '次训练,损失为 : ', epoch_loss)

        print("训练完成,正在测试")
        text_x = test_dataset[:, 0]
        text_y = test_dataset[:, 1]
        # A prediction is correct when the arg-max class matches the label's.
        correct = tf.equal(tf.argmax(predict, 1), tf.argmax(Y, 1))
        accuracy = tf.reduce_mean(tf.cast(correct, 'float'))
        print('准确率: ', accuracy.eval({X: list(text_x), Y: list(text_y)}))

        def test(valia):
            """Return the predicted class index for the review text *valia*."""
            the_vector = string_to_vector(lex, valia, None)
            return tf.argmax(predict, 1).eval({X: list(the_vector)})[0]

        prompt = "\n请输入评论,输入 'quit' 退出\n"
        while True:
            message = input(prompt)
            # Fix: leave before classifying; the original also ran the
            # network on the literal text 'quit' and printed a verdict.
            if message == 'quit':
                break
            num = test(message)
            print(diary[str(num)])
# Script entry point: train, print the accuracy, then start the interactive prompt.
train_neural_network(X, Y)
从训练结果可以看出,准确率还是很乐观的
我们继续输入一些数据来使用这个网络:
网友评论