Neural Networks in Practice: Sentiment Classification
I recently enrolled in Udacity's Deep Learning Foundation nanodegree. This post covers the week-two lesson, which builds a neural network for sentiment classification. The course works on English reviews; here I adapt it to Chinese.
The first step is Chinese word segmentation, for which I use jieba (结巴).
The notebook for this post is at: https://github.com/zhuanxuhit/nd101/blob/master/1.Intro_to_Deep_Learning/3.How_to_Do_Sentiment_Analysis/Words2Vec.ipynb
import jieba
seg = "使用结巴来对中文进行分词"
seg_list = jieba.cut(seg)
print("/ ".join(seg_list)
使用/ 结巴/ 来/ 对/ 中文/ 进行/ 分词
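As a side note, jieba also offers a full mode and a convenience helper that returns a plain list; a quick illustration with the same sentence (not needed for the rest of the post):
print("/ ".join(jieba.cut(seg, cut_all=True)))   # full mode: enumerate every word jieba can find
print(jieba.lcut(seg))                           # lcut returns a list instead of a generator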
1. What should sentiment classification be based on?
One idea is to count how often each word appears in positive and in negative reviews separately; in theory, some words should lean clearly toward one class or the other. Let's verify that below.
import pandas as pd
import numpy as np
neg=pd.read_excel('data/neg.xls',header=None,index_col=None)
pos=pd.read_excel('data/pos.xls',header=None,index_col=None)
pos['mark']=1
neg['mark']=0 # label the training corpora
pn=pd.concat([pos,neg],ignore_index=True) # merge the two corpora
neglen=len(neg)
poslen=len(pos) # corpus sizes
cw = lambda x: list(jieba.cut(x)) # segmentation helper
pn['words'] = pn[0].apply(cw)
# shuffle the rows
pn = pn.reindex(np.random.permutation(pn.index))
pn.head()
from collections import Counter
import numpy as np
positive_counts = Counter()
negative_counts = Counter()
total_counts = Counter()
len(pn['words'])
21105
pn['words'][1][:10]
['作者', '真有', '英国人', '严谨', '的', '风格', ',', '提出', '观点', '、']
pn['mark'][1]
1
Let's start by counting how many times each word occurs:
for i in range(len(pn['words'])):
    if pn['mark'][i] == 1:
        for word in pn['words'][i]:
            positive_counts[word] += 1
            total_counts[word] += 1
    else:
        for word in pn['words'][i]:
            negative_counts[word] += 1
            total_counts[word] += 1
positive_counts.most_common(10)
[(',', 63862),
('的', 48811),
('。', 25667),
('了', 14110),
('是', 10775),
('我', 9578),
('很', 8270),
(',', 6682),
(' ', 6354),
('也', 6307)]
negative_counts.most_common(10)
[(',', 42831),
('的', 28859),
('。', 16847),
('了', 13476),
(',', 8462),
('是', 7994),
('我', 7841),
(' ', 7528),
('!', 7084),
('不', 5821)]
pos_neg_ratios = Counter()
for term,cnt in list(total_counts.most_common()):
    if(cnt > 100):
        pos_neg_ratio = positive_counts[term] / float(negative_counts[term]+1)
        pos_neg_ratios[term] = pos_neg_ratio
list(reversed(pos_neg_ratios.most_common()))[0:30]
[('爱玲', 1.9428571428571428),
('篡改', 1.9473684210526316),
('吹捧', 1.9487179487179487),
('于丹', 1.9523809523809523),
('恶劣', 1.9545454545454546),
('切糕', 1.9666666666666666),
('日货', 2.0217391304347827),
('旅馆', 2.022222222222222),
('倒闭', 2.025),
('再也不会', 2.026315789473684),
('太烂', 2.0285714285714285),
('上当', 2.0285714285714285),
('工作日', 2.0294117647058822),
('丹', 2.0412371134020617),
('胡兰成', 2.0689655172413794),
('炒作', 2.075268817204301),
('无耻', 2.0754716981132075),
('不买', 2.099378881987578),
('求己', 2.1025641025641026),
('无良', 2.1052631578947367),
('畅销书', 2.121212121212121),
('最差', 2.122137404580153),
('骗', 2.14),
('要命', 2.1458333333333335),
('烂', 2.1515151515151514),
('抵制', 2.1634980988593155),
('毕淑敏', 2.1666666666666665),
('看不下去', 2.1666666666666665),
('师', 2.1911764705882355),
('退货', 2.191919191919192)]
for word,ratio in pos_neg_ratios.most_common():
    if(ratio > 1):
        pos_neg_ratios[word] = np.log(ratio)
    else:
        pos_neg_ratios[word] = -np.log((1 / (ratio+0.01)))
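As a quick aside (not in the course notebook), this transform maps raw ratios onto a scale that is roughly symmetric around 0: a word twice as common in positive reviews scores about +0.69, a neutral word scores near 0, and a word twice as common in negative reviews scores about -0.67.
for r in (2.0, 1.0, 0.5):   # illustrative raw ratios only
    score = np.log(r) if r > 1 else -np.log(1 / (r + 0.01))
    print(r, round(score, 3))   # prints roughly 0.693, 0.01, -0.673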
pos_neg_ratios.most_common(10)
[('结局', 3.7954891891721947),
('成长', 3.0002674287193822),
('快乐', 2.968080742223481),
('世界', 2.5416019934645457),
('幸福', 2.5403579543242145),
('感谢', 2.2894558015522528),
('很漂亮', 2.256541154492639),
('漂亮', 2.0762312660495876),
('人生', 2.044102175234527),
('感动', 2.0074680420547466)]
Here we start to notice sentiment-carrying words such as 好, 不错 and 喜欢.
list(reversed(pos_neg_ratios.most_common()))[0:10]
[('抵制', -2.7025520357679857),
('死机', -2.5163389039584163),
('很差', -2.286055791042835),
('垃圾', -2.1947851588250678),
('失望', -1.7210375431333034),
('差', -1.7188899346366118),
('页', -1.6315365868037763),
('郁闷', -1.5735428334599848),
('根本', -1.5365211256505318),
('后悔', -1.5336737595318182)]
We now have a rough picture: after segmentation, reviews labeled positive and negative really do differ, with some words appearing much more often in positive reviews than in negative ones (and vice versa).
vocab = set(total_counts.keys())
vocab_size = len(vocab)
print(vocab_size)
50884
Indexing the words
The plan now is simply to number the 50884 distinct tokens, i.e. use a vector of length 50884, so that every review can be represented as a 1 x 50884 vector.
layer_0 = np.zeros((1,vocab_size))
word2index = {}
for i,word in enumerate(vocab):
    word2index[word] = i
# word2index
def update_input_layer(reviews):
    global layer_0
    # clear out previous state, reset the layer to be all 0s
    layer_0 *= 0
    for word in reviews:
        layer_0[0][word2index[word]] += 1
update_input_layer(pn['words'][5])
layer_0
array([[ 0., 0., 0., ..., 0., 0., 0.]])
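The printout looks all-zero only because NumPy hides the nonzero entries behind the ellipsis; a quick check (an extra cell, not in the original notebook) confirms the bag-of-words counts are really there:
# how many word occurrences were recorded, and in how many distinct vocabulary slots
print(int(layer_0.sum()), np.count_nonzero(layer_0))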
The next step is to build the neural network itself; for how to construct a simple neural network, see the earlier post 《如何构建一个简单的神经网络》 (How to build a simple neural network).
import time
import sys
import numpy as np
# Let's tweak our network from before to model these phenomena
class SentimentNetwork:
def __init__(self, reviews,labels,hidden_nodes = 10, learning_rate = 0.1):
# set our random number generator
# np.random.seed(1)
self.pre_process_data(reviews, labels)
self.init_network(len(self.review_vocab),hidden_nodes, 1, learning_rate)
def pre_process_data(self, reviews, labels):
review_vocab = set()
for review in reviews:
for word in review:
review_vocab.add(word)
self.review_vocab = list(review_vocab)
# label_vocab = set()
# for label in labels:
# label_vocab.add(label)
# self.label_vocab = list(label_vocab)
self.review_vocab_size = len(self.review_vocab)
# self.label_vocab_size = len(self.label_vocab)
self.word2index = {}
for i, word in enumerate(self.review_vocab):
self.word2index[word] = i
# self.label2index = {}
# for i, label in enumerate(self.label_vocab):
# self.label2index[label] = i
def init_network(self, input_nodes, hidden_nodes, output_nodes, learning_rate):
# Set number of nodes in input, hidden and output layers.
self.input_nodes = input_nodes
self.hidden_nodes = hidden_nodes
self.output_nodes = output_nodes
# Initialize weights
self.weights_0_1 = np.zeros((self.hidden_nodes,self.input_nodes))
self.weights_1_2 = np.random.normal(0.0, self.output_nodes**-0.5,
(self.output_nodes, self.hidden_nodes))
self.learning_rate = learning_rate
self.layer_0 = np.zeros((input_nodes,1))
def update_input_layer(self,review):
# clear out previous state, reset the layer to be all 0s
self.layer_0 *= 0
for word in review:
if(word in self.word2index.keys()):
self.layer_0[self.word2index[word]][0] += 1
# def get_target_for_label(self,label):
# if(label == 'POSITIVE'):
# return 1
# else:
# return 0
def sigmoid(self,x):
return 1 / (1 + np.exp(-x))
def sigmoid_output_2_derivative(self,output):
return output * (1 - output)
def train(self, training_reviews, training_labels):
assert(len(training_reviews) == len(training_labels))
correct_so_far = 0
start = time.time()
for i in range(len(training_reviews)):
review = training_reviews[i]
label = training_labels[i]
#### Implement the forward pass here ####
### Forward pass ###
# Input Layer
self.update_input_layer(review)
layer_0 = self.layer_0
# Hidden layer
layer_1 = self.weights_0_1.dot(self.layer_0)
# Output layer
layer_2 = self.sigmoid(self.weights_1_2.dot(layer_1))
#### Implement the backward pass here ####
### Backward pass ###
# TODO: Output error
layer_2_error = layer_2 - label # Output layer error is the difference between desired target and actual output.
layer_2_delta = layer_2_error * self.sigmoid_output_2_derivative(layer_2)
# TODO: Backpropagated error
layer_1_error = self.weights_1_2.T.dot(layer_2_delta) # errors propagated to the hidden layer
layer_1_delta = layer_1_error # hidden layer gradients - no nonlinearity so it's the same as the error
# TODO: Update the weights
self.weights_1_2 -= layer_2_delta.dot(layer_1.T) * self.learning_rate # update hidden-to-output weights with gradient descent step
self.weights_0_1 -= layer_1_delta.dot(layer_0.T) * self.learning_rate # update input-to-hidden weights with gradient descent step
if(np.abs(layer_2_error) < 0.5):
correct_so_far += 1
reviews_per_second = i / float(time.time() - start)
sys.stdout.write("\rProgress:" + __builtins__.str(100 * i/float(len(training_reviews)))[:4] + "% Speed(reviews/sec):" + __builtins__.str(reviews_per_second)[0:5] + " #Correct:" + __builtins__.str(correct_so_far) + " #Trained:" + __builtins__.str(i+1) + " Training Accuracy:" + __builtins__.str(correct_so_far * 100 / float(i+1))[:4] + "%")
if(i % 2500 == 0):
print("")
def test(self, testing_reviews, testing_labels):
correct = 0
start = time.time()
for i in range(len(testing_reviews)):
pred = self.run(testing_reviews[i])
if(pred == testing_labels[i]):
correct += 1
reviews_per_second = i / float(time.time() - start)
sys.stdout.write("\rProgress:" + __builtins__.str(100 * i/float(len(testing_reviews)))[:4] \
+ "% Speed(reviews/sec):" + __builtins__.str(reviews_per_second)[0:5] \
+ "% #Correct:" + __builtins__.str(correct) + " #Tested:" + __builtins__.str(i+1) + " Testing Accuracy:" + __builtins__.str(correct * 100 / float(i+1))[:4] + "%")
def run(self, review):
# Input Layer
# print(review)
self.update_input_layer(review)
# print(self.layer_0.shape)
# print(self.weights_0_1.shape)
# print(np.dot(self.weights_0_1,self.layer_0))
# Hidden layer
layer_1 = self.weights_0_1.dot(self.layer_0)
# Output layer
layer_2 = self.sigmoid(self.weights_1_2.dot(layer_1))
        # print(layer_2) # noticed it stayed around 0.5
if(layer_2[0] > 0.5):
return 1
else:
return 0
reviews = pn['words'].values
labels = pn['mark'].values
# train on everything except the last 1000 reviews
mlp = SentimentNetwork(reviews[:-1000],labels[:-1000], learning_rate=0.1)
# train the network
# print(len(reviews),len(labels))
mlp.train(reviews[:-1000],labels[:-1000])
Progress:0.0% Speed(reviews/sec):0.0 #Correct:0 #Trained:1 Training Accuracy:0.0%
Progress:12.4% Speed(reviews/sec):177.3 #Correct:1499 #Trained:2501 Training Accuracy:59.9%
Progress:24.8% Speed(reviews/sec):169.9 #Correct:3315 #Trained:5001 Training Accuracy:66.2%
Progress:29.6% Speed(reviews/sec):165.2 #Correct:4052 #Trained:5960 Training Accuracy:67.9%
---------------------------------------------------------------------------
KeyboardInterrupt Traceback (most recent call last)
<ipython-input-224-2db8cecc49e0> in <module>()
1 # train the network
2 # print(len(reviews),len(labels))
----> 3 mlp.train(reviews[:-1000],labels[:-1000])
<ipython-input-211-d67a17005419> in train(self, training_reviews, training_labels)
117 # TODO: Update the weights
118 self.weights_1_2 -= layer_2_delta.dot(layer_1.T) * self.learning_rate # update hidden-to-output weights with gradient descent step
--> 119 self.weights_0_1 -= layer_1_delta.dot(layer_0.T) * self.learning_rate # update input-to-hidden weights with gradient descent step
120
121 if(np.abs(layer_2_error) < 0.5):
KeyboardInterrupt:
The problem with the run above is that it is slow, and the accuracy is not great either. Why?
Let's analyze what is causing this.
review_counter = Counter()
for word in reviews[0]:
    review_counter[word] += 1
review_counter.most_common(10)
[(',', 9),
('。', 5),
('功能', 2),
('手机', 2),
('内存', 2),
('的', 2),
('用', 2),
('可', 2),
('都', 2),
('了', 2)]
Looking at the output, the most frequent tokens are punctuation such as ',' and '。' and other uninformative words. One optimization, then, is to record only whether a word occurs (a 1) rather than how many times it occurs; the updated class is below.
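First, a toy illustration of the difference between the two encodings (a sketch with a made-up sentence, not part of the original notebook):
toy_review = list(jieba.cut("很好,很好,真的很好。"))
print(Counter(toy_review))                      # count encoding: punctuation and repeats dominate
print({word: 1 for word in set(toy_review)})    # binary encoding: each word counts once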
import time
import sys
import numpy as np
# Let's tweak our network from before to model these phenomena
class SentimentNetwork:
def __init__(self, reviews,labels,hidden_nodes = 10, learning_rate = 0.1):
# set our random number generator
# np.random.seed(1)
self.pre_process_data(reviews, labels)
self.init_network(len(self.review_vocab),hidden_nodes, 1, learning_rate)
def pre_process_data(self, reviews, labels):
review_vocab = set()
for review in reviews:
for word in review:
review_vocab.add(word)
self.review_vocab = list(review_vocab)
# label_vocab = set()
# for label in labels:
# label_vocab.add(label)
# self.label_vocab = list(label_vocab)
self.review_vocab_size = len(self.review_vocab)
# self.label_vocab_size = len(self.label_vocab)
self.word2index = {}
for i, word in enumerate(self.review_vocab):
self.word2index[word] = i
# self.label2index = {}
# for i, label in enumerate(self.label_vocab):
# self.label2index[label] = i
def init_network(self, input_nodes, hidden_nodes, output_nodes, learning_rate):
# Set number of nodes in input, hidden and output layers.
self.input_nodes = input_nodes
self.hidden_nodes = hidden_nodes
self.output_nodes = output_nodes
# Initialize weights
self.weights_0_1 = np.zeros((self.hidden_nodes,self.input_nodes))
self.weights_1_2 = np.random.normal(0.0, self.output_nodes**-0.5,
(self.output_nodes, self.hidden_nodes))
self.learning_rate = learning_rate
self.layer_0 = np.zeros((input_nodes,1))
def update_input_layer(self,review):
# clear out previous state, reset the layer to be all 0s
self.layer_0 *= 0
for word in review:
if(word in self.word2index.keys()):
self.layer_0[self.word2index[word]][0] = 1
# def get_target_for_label(self,label):
# if(label == 'POSITIVE'):
# return 1
# else:
# return 0
def sigmoid(self,x):
return 1 / (1 + np.exp(-x))
def sigmoid_output_2_derivative(self,output):
return output * (1 - output)
def train(self, training_reviews, training_labels):
assert(len(training_reviews) == len(training_labels))
correct_so_far = 0
start = time.time()
for i in range(len(training_reviews)):
review = training_reviews[i]
label = training_labels[i]
#### Implement the forward pass here ####
### Forward pass ###
# Input Layer
self.update_input_layer(review)
layer_0 = self.layer_0
# Hidden layer
layer_1 = self.weights_0_1.dot(self.layer_0)
# Output layer
layer_2 = self.sigmoid(self.weights_1_2.dot(layer_1))
#### Implement the backward pass here ####
### Backward pass ###
# TODO: Output error
layer_2_error = layer_2 - label # Output layer error is the difference between desired target and actual output.
layer_2_delta = layer_2_error * self.sigmoid_output_2_derivative(layer_2)
# TODO: Backpropagated error
layer_1_error = self.weights_1_2.T.dot(layer_2_delta) # errors propagated to the hidden layer
layer_1_delta = layer_1_error # hidden layer gradients - no nonlinearity so it's the same as the error
# TODO: Update the weights
self.weights_1_2 -= layer_2_delta.dot(layer_1.T) * self.learning_rate # update hidden-to-output weights with gradient descent step
self.weights_0_1 -= layer_1_delta.dot(layer_0.T) * self.learning_rate # update input-to-hidden weights with gradient descent step
if(np.abs(layer_2_error) < 0.5):
correct_so_far += 1
reviews_per_second = i / float(time.time() - start)
sys.stdout.write("\rProgress:" + __builtins__.str(100 * i/float(len(training_reviews)))[:4] + "% Speed(reviews/sec):" + __builtins__.str(reviews_per_second)[0:5] + " #Correct:" + __builtins__.str(correct_so_far) + " #Trained:" + __builtins__.str(i+1) + " Training Accuracy:" + __builtins__.str(correct_so_far * 100 / float(i+1))[:4] + "%")
if(i % 2500 == 0):
print("")
def test(self, testing_reviews, testing_labels):
correct = 0
start = time.time()
for i in range(len(testing_reviews)):
pred = self.run(testing_reviews[i])
if(pred == testing_labels[i]):
correct += 1
reviews_per_second = i / float(time.time() - start)
sys.stdout.write("\rProgress:" + __builtins__.str(100 * i/float(len(testing_reviews)))[:4] \
+ "% Speed(reviews/sec):" + __builtins__.str(reviews_per_second)[0:5] \
+ "% #Correct:" + __builtins__.str(correct) + " #Tested:" + __builtins__.str(i+1) + " Testing Accuracy:" + __builtins__.str(correct * 100 / float(i+1))[:4] + "%")
def run(self, review):
# Input Layer
# print(review)
self.update_input_layer(review)
# print(self.layer_0.shape)
# print(self.weights_0_1.shape)
# print(np.dot(self.weights_0_1,self.layer_0))
# Hidden layer
layer_1 = self.weights_0_1.dot(self.layer_0)
# Output layer
layer_2 = self.sigmoid(self.weights_1_2.dot(layer_1))
        # print(layer_2) # noticed it stayed around 0.5
if(layer_2[0] > 0.5):
return 1
else:
return 0
# train on everything except the last 1000 reviews
mlp = SentimentNetwork(reviews[:-1000],labels[:-1000], learning_rate=0.1)
mlp.train(reviews[:-1000],labels[:-1000])
Progress:0.0% Speed(reviews/sec):0.0 #Correct:0 #Trained:1 Training Accuracy:0.0%
Progress:12.4% Speed(reviews/sec):169.4 #Correct:1924 #Trained:2501 Training Accuracy:76.9%
Progress:24.8% Speed(reviews/sec):169.3 #Correct:4013 #Trained:5001 Training Accuracy:80.2%
Progress:37.3% Speed(reviews/sec):162.4 #Correct:6148 #Trained:7501 Training Accuracy:81.9%
Progress:49.7% Speed(reviews/sec):161.6 #Correct:8316 #Trained:10001 Training Accuracy:83.1%
Progress:62.1% Speed(reviews/sec):162.1 #Correct:10504 #Trained:12501 Training Accuracy:84.0%
Progress:74.6% Speed(reviews/sec):162.8 #Correct:12684 #Trained:15001 Training Accuracy:84.5%
Progress:87.0% Speed(reviews/sec):165.6 #Correct:14905 #Trained:17501 Training Accuracy:85.1%
Progress:99.4% Speed(reviews/sec):167.5 #Correct:17122 #Trained:20001 Training Accuracy:85.6%
Progress:99.9% Speed(reviews/sec):167.6 #Correct:17210 #Trained:20105 Training Accuracy:85.6%
mlp.test(reviews[-1000:],labels[-1000:])
Progress:99.9% Speed(reviews/sec):1023.% #Correct:881 #Tested:1000 Testing Accuracy:88.1%
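To score a brand-new review, segment it and call run directly (an illustrative call with a made-up sentence; the exact prediction depends on the trained weights):
new_review = list(jieba.cut("手机外观很漂亮,性价比高,非常满意"))
print(mlp.run(new_review))   # expect 1 for a positive review, 0 for a negative one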
Speeding up the computation
Next, let's look at why the computation is so slow. Consider the following small example:
layer_0 = np.zeros(10)
layer_0[4] = 1
layer_0[9] = 1
weights_0_1 = np.random.randn(10,5)
value1 = layer_0.dot(weights_0_1)
value1
array([-0.20849069, -0.69363945, -0.59383309, 0.6525091 , -0.02585029])
indices = [4,9]
layer_1 = np.zeros(5)
for index in indices:
    layer_1 += (weights_0_1[index])
layer_1 == value1
array([ True, True, True, True, True], dtype=bool)
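Before applying this to the network, here is a rough feel for the savings at a more realistic scale (the sizes below are made up for illustration; timings will vary by machine):
import time
import numpy as np
vocab_n, hidden_n, active = 50000, 10, 80   # assumed sizes: vocabulary, hidden units, distinct words in one review
w = np.random.randn(hidden_n, vocab_n)
x = np.zeros((vocab_n, 1))
idx = np.random.choice(vocab_n, active, replace=False)
x[idx] = 1
t0 = time.time()
for _ in range(1000):
    full = w.dot(x)                                  # dense: touches every column
t1 = time.time()
for _ in range(1000):
    sparse = w[:, idx].sum(axis=1, keepdims=True)    # sparse: touches only the active columns
t2 = time.time()
print(np.allclose(full, sparse), round(t1 - t0, 3), round(t2 - t1, 3))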
So, following the idea above, we only need to process the indices that are actually set (non-zero), which greatly reduces the amount of computation. That gives the following optimized version:
# Let's tweak our network from before to model these phenomena
class SentimentNetwork:
def __init__(self, reviews,labels,hidden_nodes = 10, learning_rate = 0.1):
# set our random number generator
np.random.seed(1)
self.pre_process_data(reviews, labels)
self.init_network(len(self.review_vocab),hidden_nodes, 1, learning_rate)
def pre_process_data(self, reviews, labels):
review_vocab = set()
for review in reviews:
for word in review:
review_vocab.add(word)
self.review_vocab = list(review_vocab)
# label_vocab = set()
# for label in labels:
# label_vocab.add(label)
# self.label_vocab = list(label_vocab)
self.review_vocab_size = len(self.review_vocab)
# self.label_vocab_size = len(self.label_vocab)
self.word2index = {}
for i, word in enumerate(self.review_vocab):
self.word2index[word] = i
# self.label2index = {}
# for i, label in enumerate(self.label_vocab):
# self.label2index[label] = i
def init_network(self, input_nodes, hidden_nodes, output_nodes, learning_rate):
# Set number of nodes in input, hidden and output layers.
self.input_nodes = input_nodes
self.hidden_nodes = hidden_nodes
self.output_nodes = output_nodes
# Initialize weights
self.weights_0_1 = np.zeros((self.hidden_nodes,self.input_nodes))
self.weights_1_2 = np.random.normal(0.0, self.output_nodes**-0.5,
(self.output_nodes, self.hidden_nodes))
self.learning_rate = learning_rate
self.layer_0 = np.zeros((input_nodes,1))
self.layer_1 = np.zeros((hidden_nodes,1))
def update_input_layer(self,review):
# clear out previous state, reset the layer to be all 0s
self.layer_0 *= 0
for word in review:
if(word in self.word2index.keys()):
self.layer_0[self.word2index[word]][0] = 1
# def get_target_for_label(self,label):
# if(label == 'POSITIVE'):
# return 1
# else:
# return 0
def sigmoid(self,x):
return 1 / (1 + np.exp(-x))
def sigmoid_output_2_derivative(self,output):
return output * (1 - output)
def train(self, training_reviews_raw, training_labels):
training_reviews = list()
for review in training_reviews_raw:
indices = set()
for word in review:
if(word in self.word2index.keys()):
indices.add(self.word2index[word])
training_reviews.append(list(indices))
assert(len(training_reviews) == len(training_labels))
correct_so_far = 0
start = time.time()
for i in range(len(training_reviews)):
review = training_reviews[i]
label = training_labels[i]
#### Implement the forward pass here ####
### Forward pass ###
# Input Layer
# self.update_input_layer(review)
# layer_0 = self.layer_0
# Hidden layer
# layer_1 = self.weights_0_1.dot(self.layer_0)
self.layer_1 *= 0
for index in review:
self.layer_1 += self.weights_0_1[:,[index]]
# Output layer
layer_2 = self.sigmoid(self.weights_1_2.dot(self.layer_1))
#### Implement the backward pass here ####
### Backward pass ###
# TODO: Output error
layer_2_error = layer_2 - label # Output layer error is the difference between desired target and actual output.
layer_2_delta = layer_2_error * self.sigmoid_output_2_derivative(layer_2)
# TODO: Backpropagated error
layer_1_error = self.weights_1_2.T.dot(layer_2_delta) # errors propagated to the hidden layer
layer_1_delta = layer_1_error # hidden layer gradients - no nonlinearity so it's the same as the error
# TODO: Update the weights
self.weights_1_2 -= layer_2_delta.dot(self.layer_1.T) * self.learning_rate # update hidden-to-output weights with gradient descent step
# self.weights_0_1 -= layer_1_delta.dot(self.layer_0.T) * self.learning_rate # update input-to-hidden weights with gradient descent step
for index in review:
self.weights_0_1[:,[index]] -= layer_1_delta * self.learning_rate
if(np.abs(layer_2_error) < 0.5):
correct_so_far += 1
reviews_per_second = i / float(time.time() - start)
sys.stdout.write("\rProgress:" + __builtins__.str(100 * i/float(len(training_reviews)))[:4] + "% Speed(reviews/sec):" + __builtins__.str(reviews_per_second)[0:5] + " #Correct:" + __builtins__.str(correct_so_far) + " #Trained:" + __builtins__.str(i+1) + " Training Accuracy:" + __builtins__.str(correct_so_far * 100 / float(i+1))[:4] + "%")
if(i % 2500 == 0):
print("")
def test(self, testing_reviews, testing_labels):
correct = 0
start = time.time()
for i in range(len(testing_reviews)):
pred = self.run(testing_reviews[i])
if(pred == testing_labels[i]):
correct += 1
reviews_per_second = i / float(time.time() - start)
sys.stdout.write("\rProgress:" + __builtins__.str(100 * i/float(len(testing_reviews)))[:4] \
+ "% Speed(reviews/sec):" + __builtins__.str(reviews_per_second)[0:5] \
+ "% #Correct:" + __builtins__.str(correct) + " #Tested:" + __builtins__.str(i+1) + " Testing Accuracy:" + __builtins__.str(correct * 100 / float(i+1))[:4] + "%")
def run(self, review):
# Input Layer
# print(review)
self.update_input_layer(review)
# print(self.layer_0.shape)
# print(self.weights_0_1.shape)
# print(np.dot(self.weights_0_1,self.layer_0))
# Hidden layer
layer_1 = self.weights_0_1.dot(self.layer_0)
# Output layer
layer_2 = self.sigmoid(self.weights_1_2.dot(layer_1))
        # print(layer_2) # noticed it stayed around 0.5
if(layer_2[0] > 0.5):
return 1
else:
return 0
# train on everything except the last 1000 reviews
mlp = SentimentNetwork(reviews[:-1000],labels[:-1000], learning_rate=0.1)
mlp.train(reviews[:-1000],labels[:-1000])
Progress:0.0% Speed(reviews/sec):0.0 #Correct:0 #Trained:1 Training Accuracy:0.0%
Progress:12.4% Speed(reviews/sec):785.5 #Correct:1881 #Trained:2501 Training Accuracy:75.2%
Progress:24.8% Speed(reviews/sec):756.7 #Correct:3959 #Trained:5001 Training Accuracy:79.1%
Progress:37.3% Speed(reviews/sec):700.4 #Correct:6077 #Trained:7501 Training Accuracy:81.0%
Progress:49.7% Speed(reviews/sec):694.7 #Correct:8238 #Trained:10001 Training Accuracy:82.3%
Progress:62.1% Speed(reviews/sec):700.1 #Correct:10431 #Trained:12501 Training Accuracy:83.4%
Progress:74.6% Speed(reviews/sec):701.2 #Correct:12607 #Trained:15001 Training Accuracy:84.0%
Progress:87.0% Speed(reviews/sec):706.2 #Correct:14827 #Trained:17501 Training Accuracy:84.7%
Progress:99.4% Speed(reviews/sec):708.3 #Correct:17034 #Trained:20001 Training Accuracy:85.1%
Progress:99.9% Speed(reviews/sec):708.4 #Correct:17122 #Trained:20105 Training Accuracy:85.1%
# side note on the indexing used above: a[:,[1]] keeps the column as a 2-D slice,
# so a column vector can be assigned into (or subtracted from) it directly
a = np.random.normal(0,1,(2,3))
b = np.random.normal(0,1,(2,1))
a
array([[ 0.58281521, -1.10061918, 1.14472371],
[ 0.90159072, 0.50249434, 0.90085595]])
b
array([[-0.68372786],
[-0.12289023]])
a[:,[1]] = b
a
array([[ 0.58281521, -0.68372786, 1.14472371],
[ 0.90159072, -0.12289023, 0.90085595]])
Reducing noise by selectively dropping uninformative words
With that optimization in place, let's see what else can be improved. Let's plot histograms comparing the positive/negative ratios of the segmented words.
import pandas as pd
neg=pd.read_excel('data/neg.xls',header=None,index_col=None)
pos=pd.read_excel('data/pos.xls',header=None,index_col=None)
pos['mark']=1
neg['mark']=0 # label the training corpora
pn=pd.concat([pos,neg],ignore_index=True) # merge the two corpora
neglen=len(neg)
poslen=len(pos) # corpus sizes
cw = lambda x: list(jieba.cut(x)) # segmentation helper
pn['words'] = pn[0].apply(cw)
pn = pn.reindex(np.random.permutation(pn.index))
positive_counts = Counter()
negative_counts = Counter()
total_counts = Counter()
for i in range(len(pn['words'])):
    if pn['mark'][i] == 1:
        for word in pn['words'][i]:
            positive_counts[word] += 1
            total_counts[word] += 1
    else:
        for word in pn['words'][i]:
            negative_counts[word] += 1
            total_counts[word] += 1
pos_neg_ratios = Counter()
for term,cnt in list(total_counts.most_common()):
    if(cnt > 180):
        pos_neg_ratio = positive_counts[term] / float(negative_counts[term]+1)
        pos_neg_ratios[term] = pos_neg_ratio
for word,ratio in pos_neg_ratios.most_common():
    if(ratio > 1):
        pos_neg_ratios[word] = np.log(ratio)
    else:
        pos_neg_ratios[word] = -np.log((1 / (ratio+0.01)))
#
pos_neg_ratios.most_common(10)
[('结局', 3.7954891891721947),
('成长', 3.0002674287193822),
('快乐', 2.968080742223481),
('世界', 2.5416019934645457),
('幸福', 2.5403579543242145),
('感谢', 2.2894558015522528),
('很漂亮', 2.256541154492639),
('漂亮', 2.0762312660495876),
('人生', 2.044102175234527),
('感动', 2.0074680420547466)]
list(reversed(pos_neg_ratios.most_common()))[0:10]
[('抵制', -2.7025520357679857),
('死机', -2.5163389039584163),
('很差', -2.286055791042835),
('垃圾', -2.1947851588250678),
('失望', -1.7210375431333034),
('差', -1.7188899346366118),
('页', -1.6315365868037763),
('郁闷', -1.5735428334599848),
('根本', -1.5365211256505318),
('后悔', -1.5336737595318182)]
from bokeh.models import ColumnDataSource, LabelSet
from bokeh.plotting import figure, show, output_file
from bokeh.io import output_notebook
output_notebook()
hist, edges = np.histogram(list(map(lambda x:x[1],pos_neg_ratios.most_common())), density=True, bins=100)
# print(hist,edges)
p = figure(tools="pan,wheel_zoom,reset,save",
toolbar_location="above",
title="Word Positive/Negative Affinity Distribution")
p.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:], line_color="#555555")
show(p)
(figure: histogram of word positive/negative affinity scores)
frequency_frequency = Counter()
for word, cnt in total_counts.most_common():
    frequency_frequency[cnt] += 1
hist, edges = np.histogram(list(map(lambda x:x[1],frequency_frequency.most_common())), density=True, bins=100)
p = figure(tools="pan,wheel_zoom,reset,save",
toolbar_location="above",
title="The frequency distribution of the words in our corpus")
p.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:], line_color="#555555")
show(p)
(figure: frequency distribution of the words in the corpus)
From the two plots above we can see that most words actually occur fewer than 180 times, so we can simply filter those words out.
The other observation concerns the ratio distribution: words in the middle of the range are ambiguous, with no strong contrast between positive and negative (say between -0.2 and 0.2), so we can ignore them as well.
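A quick check of how much these two filters would shrink the vocabulary, reusing the counters computed above (the 0.2 cutoff here is only illustrative; the network below uses polarity_cutoff=0.1):
rare = sum(1 for w, c in total_counts.items() if c <= 180)
survivors = [w for w, c in total_counts.items()
             if c > 180 and abs(pos_neg_ratios[w]) >= 0.2]
print(len(total_counts), rare, len(survivors))   # vocabulary size / rare words dropped / words kept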
Taking these two points into account gives the following optimized version:
import time
import sys
import numpy as np
# Let's tweak our network from before to model these phenomena
class SentimentNetwork:
def __init__(self, reviews,labels,min_count = 180,polarity_cutoff = 0.1,hidden_nodes = 10, learning_rate = 0.1):
# set our random number generator
np.random.seed(1)
self.pre_process_data(reviews, labels, polarity_cutoff, min_count)
self.init_network(len(self.review_vocab),hidden_nodes, 1, learning_rate)
def pre_process_data(self, reviews, labels, polarity_cutoff, min_count):
positive_counts = Counter()
negative_counts = Counter()
total_counts = Counter()
        # count how often each word appears in positive and negative reviews
for i in range(len(reviews)):
if(labels[i] == 1):
for word in reviews[i]:
positive_counts[word] += 1
total_counts[word] += 1
else:
for word in reviews[i]:
negative_counts[word] += 1
total_counts[word] += 1
pos_neg_ratios = Counter()
        # compute each word's positive/negative ratio
for term,cnt in list(total_counts.most_common()):
if(cnt >= min_count):
pos_neg_ratio = positive_counts[term] / float(negative_counts[term]+1)
pos_neg_ratios[term] = pos_neg_ratio
        # squash the ratios onto a log scale
for word,ratio in pos_neg_ratios.most_common():
if(ratio > 1):
pos_neg_ratios[word] = np.log(ratio)
else:
pos_neg_ratios[word] = -np.log((1 / (ratio + 0.01)))
review_vocab = set()
for review in reviews:
for word in review:
if(total_counts[word] > min_count):
if(word in pos_neg_ratios.keys()):
if((pos_neg_ratios[word] >= polarity_cutoff) or (pos_neg_ratios[word] <= -polarity_cutoff)):
review_vocab.add(word)
else:
review_vocab.add(word)
self.review_vocab = list(review_vocab)
# label_vocab = set()
# for label in labels:
# label_vocab.add(label)
# self.label_vocab = list(label_vocab)
self.review_vocab_size = len(self.review_vocab)
# self.label_vocab_size = len(self.label_vocab)
self.word2index = {}
for i, word in enumerate(self.review_vocab):
self.word2index[word] = i
# self.label2index = {}
# for i, label in enumerate(self.label_vocab):
# self.label2index[label] = i
def init_network(self, input_nodes, hidden_nodes, output_nodes, learning_rate):
# Set number of nodes in input, hidden and output layers.
self.input_nodes = input_nodes
self.hidden_nodes = hidden_nodes
self.output_nodes = output_nodes
# Initialize weights
self.weights_0_1 = np.zeros((self.hidden_nodes,self.input_nodes))
self.weights_1_2 = np.random.normal(0.0, self.output_nodes**-0.5,
(self.output_nodes, self.hidden_nodes))
self.learning_rate = learning_rate
self.layer_0 = np.zeros((input_nodes,1))
self.layer_1 = np.zeros((hidden_nodes,1))
def update_input_layer(self,review):
# clear out previous state, reset the layer to be all 0s
self.layer_0 *= 0
for word in review:
if(word in self.word2index.keys()):
self.layer_0[self.word2index[word]][0] = 1
# def get_target_for_label(self,label):
# if(label == 'POSITIVE'):
# return 1
# else:
# return 0
def sigmoid(self,x):
return 1 / (1 + np.exp(-x))
def sigmoid_output_2_derivative(self,output):
return output * (1 - output)
def train(self, training_reviews_raw, training_labels):
training_reviews = list()
for review in training_reviews_raw:
indices = set()
for word in review:
if(word in self.word2index.keys()):
indices.add(self.word2index[word])
training_reviews.append(list(indices))
assert(len(training_reviews) == len(training_labels))
correct_so_far = 0
start = time.time()
for i in range(len(training_reviews)):
review = training_reviews[i]
label = training_labels[i]
#### Implement the forward pass here ####
### Forward pass ###
# Input Layer
# self.update_input_layer(review)
# layer_0 = self.layer_0
# Hidden layer
# layer_1 = self.weights_0_1.dot(self.layer_0)
self.layer_1 *= 0
for index in review:
self.layer_1 += self.weights_0_1[:,[index]]
# Output layer
layer_2 = self.sigmoid(self.weights_1_2.dot(self.layer_1))
#### Implement the backward pass here ####
### Backward pass ###
# TODO: Output error
layer_2_error = layer_2 - label # Output layer error is the difference between desired target and actual output.
layer_2_delta = layer_2_error * self.sigmoid_output_2_derivative(layer_2)
# TODO: Backpropagated error
layer_1_error = self.weights_1_2.T.dot(layer_2_delta) # errors propagated to the hidden layer
layer_1_delta = layer_1_error # hidden layer gradients - no nonlinearity so it's the same as the error
# TODO: Update the weights
self.weights_1_2 -= layer_2_delta.dot(self.layer_1.T) * self.learning_rate # update hidden-to-output weights with gradient descent step
# self.weights_0_1 -= layer_1_delta.dot(self.layer_0.T) * self.learning_rate # update input-to-hidden weights with gradient descent step
for index in review:
self.weights_0_1[:,[index]] -= layer_1_delta * self.learning_rate
if(np.abs(layer_2_error) < 0.5):
correct_so_far += 1
reviews_per_second = i / float(time.time() - start)
sys.stdout.write("\rProgress:" + __builtins__.str(100 * i/float(len(training_reviews)))[:4] + "% Speed(reviews/sec):" + __builtins__.str(reviews_per_second)[0:5] + " #Correct:" + __builtins__.str(correct_so_far) + " #Trained:" + __builtins__.str(i+1) + " Training Accuracy:" + __builtins__.str(correct_so_far * 100 / float(i+1))[:4] + "%")
if(i % 2500 == 0):
print("")
def test(self, testing_reviews, testing_labels):
correct = 0
start = time.time()
for i in range(len(testing_reviews)):
pred = self.run(testing_reviews[i])
if(pred == testing_labels[i]):
correct += 1
reviews_per_second = i / float(time.time() - start)
sys.stdout.write("\rProgress:" + __builtins__.str(100 * i/float(len(testing_reviews)))[:4] \
+ "% Speed(reviews/sec):" + __builtins__.str(reviews_per_second)[0:5] \
+ "% #Correct:" + __builtins__.str(correct) + " #Tested:" + __builtins__.str(i+1) + " Testing Accuracy:" + __builtins__.str(correct * 100 / float(i+1))[:4] + "%")
def run(self, review):
# Input Layer
# print(review)
self.update_input_layer(review)
# print(self.layer_0.shape)
# print(self.weights_0_1.shape)
# print(np.dot(self.weights_0_1,self.layer_0))
# Hidden layer
layer_1 = self.weights_0_1.dot(self.layer_0)
# Output layer
layer_2 = self.sigmoid(self.weights_1_2.dot(layer_1))
        # print(layer_2) # noticed it stayed around 0.5
if(layer_2[0] > 0.5):
return 1
else:
return 0
mlp = SentimentNetwork(reviews[:-1000],labels[:-1000],min_count=180,polarity_cutoff=0.1,learning_rate=0.1)
mlp.train(reviews[:-1000],labels[:-1000])
Progress:0.0% Speed(reviews/sec):0.0 #Correct:0 #Trained:1 Training Accuracy:0.0%
Progress:12.4% Speed(reviews/sec):1077. #Correct:1865 #Trained:2501 Training Accuracy:74.5%
Progress:24.8% Speed(reviews/sec):1131. #Correct:3854 #Trained:5001 Training Accuracy:77.0%
Progress:37.3% Speed(reviews/sec):1214. #Correct:5898 #Trained:7501 Training Accuracy:78.6%
Progress:49.7% Speed(reviews/sec):1218. #Correct:7972 #Trained:10001 Training Accuracy:79.7%
Progress:62.1% Speed(reviews/sec):1229. #Correct:10040 #Trained:12501 Training Accuracy:80.3%
Progress:74.6% Speed(reviews/sec):1229. #Correct:12111 #Trained:15001 Training Accuracy:80.7%
Progress:87.0% Speed(reviews/sec):1243. #Correct:14210 #Trained:17501 Training Accuracy:81.1%
Progress:99.4% Speed(reviews/sec):1221. #Correct:16314 #Trained:20001 Training Accuracy:81.5%
Progress:99.9% Speed(reviews/sec):1223. #Correct:16397 #Trained:20105 Training Accuracy:81.5%
At this point processing is much faster, but the accuracy is not as good. Hmm.
Analysis: what is happening to the weights
Let's analyze how the weights change over the course of training.
One prediction is that words with similar meanings should contribute similarly to the final output, so their weight vectors should also be similar. Let's verify that.
mlp_full = SentimentNetwork(reviews[:-1000],labels[:-1000],min_count=0,polarity_cutoff=0,learning_rate=0.1)
mlp_full.train(reviews[:-1000],labels[:-1000])
Progress:0.0% Speed(reviews/sec):0.0 #Correct:0 #Trained:1 Training Accuracy:0.0%
Progress:12.4% Speed(reviews/sec):729.0 #Correct:1881 #Trained:2501 Training Accuracy:75.2%
Progress:24.8% Speed(reviews/sec):657.7 #Correct:3959 #Trained:5001 Training Accuracy:79.1%
Progress:37.3% Speed(reviews/sec):647.5 #Correct:6077 #Trained:7501 Training Accuracy:81.0%
Progress:49.7% Speed(reviews/sec):638.3 #Correct:8238 #Trained:10001 Training Accuracy:82.3%
Progress:62.1% Speed(reviews/sec):644.2 #Correct:10431 #Trained:12501 Training Accuracy:83.4%
Progress:74.6% Speed(reviews/sec):653.8 #Correct:12607 #Trained:15001 Training Accuracy:84.0%
Progress:87.0% Speed(reviews/sec):659.0 #Correct:14827 #Trained:17501 Training Accuracy:84.7%
Progress:99.4% Speed(reviews/sec):658.7 #Correct:17034 #Trained:20001 Training Accuracy:85.1%
Progress:99.9% Speed(reviews/sec):658.8 #Correct:17122 #Trained:20105 Training Accuracy:85.1%
def get_most_similar_words(focus = "快乐"):
    most_similar = Counter()
    for word in mlp_full.word2index.keys():
        most_similar[word] = np.dot(mlp_full.weights_0_1[:,mlp_full.word2index[word]],mlp_full.weights_0_1[:,mlp_full.word2index[focus]])
    return most_similar.most_common()
get_most_similar_words("快乐")[:10]
[('不错', 0.33538066226583924),
('配置', 0.25838511832658073),
('很快', 0.23718560658398344),
('外观', 0.22438845743883634),
('性价比', 0.21692741585717784),
('感动', 0.20675190024284118),
('感谢', 0.20565860660647881),
('很漂亮', 0.20487517481149334),
('光明', 0.20001075298186977),
('漂亮', 0.19736937455766837)]
get_most_similar_words("很差")[:10]
[('失望', 0.85875815559965318),
('差', 0.6900918464910496),
('抵制', 0.64737891767552369),
('很差', 0.61566873849650305),
('不如', 0.59475802794155375),
('垃圾', 0.57265810403945516),
('内容', 0.5178029813713555),
('不', 0.49460849862501899),
('不够', 0.49421096739949072),
('根本', 0.49069603367023529)]
From the above we can see that weight similarity does a good job of grouping related words together.
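The raw dot product used above can favor words whose weight vectors are simply large; a cosine-similarity variant (a small extension of get_most_similar_words, not in the original notebook) normalizes for that:
def get_most_similar_words_cosine(focus="快乐"):
    most_similar = Counter()
    focus_vec = mlp_full.weights_0_1[:, mlp_full.word2index[focus]]
    focus_norm = np.linalg.norm(focus_vec)
    for word, idx in mlp_full.word2index.items():
        vec = mlp_full.weights_0_1[:, idx]
        denom = np.linalg.norm(vec) * focus_norm
        most_similar[word] = np.dot(vec, focus_vec) / denom if denom > 0 else 0.0
    return most_similar.most_common()

get_most_similar_words_cosine("快乐")[:10]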
Finally, let's look at one more plot to make this even more intuitive.
import matplotlib.colors as colors
words_to_visualize = list()
for word, ratio in pos_neg_ratios.most_common(500):
    if(word in mlp_full.word2index.keys()):
        words_to_visualize.append(word)
for word, ratio in list(reversed(pos_neg_ratios.most_common()))[0:500]:
    if(word in mlp_full.word2index.keys()):
        words_to_visualize.append(word)
# len(words_to_visualize)
pos = 0
neg = 0
colors_list = list()
vectors_list = list()
for word in words_to_visualize:
    if word in pos_neg_ratios.keys():
        vectors_list.append(mlp_full.weights_0_1[:,mlp_full.word2index[word]])
        if(pos_neg_ratios[word] > 0):
            pos+=1
            colors_list.append("#00ff00")
        else:
            neg+=1
            colors_list.append("#000000")
# vectors_list
from sklearn.manifold import TSNE
tsne = TSNE(n_components=2, random_state=0)
words_top_ted_tsne = tsne.fit_transform(vectors_list)
p = figure(tools="pan,wheel_zoom,reset,save",
toolbar_location="above",
title="vector T-SNE for most polarized words")
source = ColumnDataSource(data=dict(x1=words_top_ted_tsne[:,0],
x2=words_top_ted_tsne[:,1],
names=words_to_visualize))
p.scatter(x="x1", y="x2", size=8, source=source,color=colors_list)
word_labels = LabelSet(x="x1", y="x2", text="names", y_offset=6,
text_font_size="8pt", text_color="#555555",
source=source, text_align='center')
# p.add_layout(word_labels)
show(p)
(figure: T-SNE projection of the most polarized word vectors)
Above we project all the word vectors into two dimensions; green points are positive words and black points are negative ones. If you uncomment the p.add_layout(word_labels) line, you can also see that words that end up close together have similar meanings.
Summary
That is everything for this post on sentiment analysis. To recap:
- First, by looking at how word counts differ between opinions, we concluded that the counts of the words in a segmented review can be used to judge its overall sentiment.
- Next, by indexing the segmented words, we turned each review into a vector.
- Then came building the neural network (the usual routine).
- After that we kept asking how to compute faster, and found we could drop words whose frequency is too low, as well as words that appear in both positive and negative reviews and therefore carry little signal.
- Finally, we looked at what the trained weights mean, and found that words can be grouped by their weights: words expressing the same sentiment naturally cluster together.
Looking at the remaining shortcomings: for the word inputs we only did a naive encoding, ignoring the order of words within a review and the fact that different words can share the same meaning. The next post will improve on this with an RNN and word2vec.