基于LSTM的文本分类

作者: 还闹不闹 | 来源:发表于2020-06-20 18:24 被阅读0次

2018-11-10
基于LSTM的文本分类
多分类
基于LSTM的文本分类模型
从代码学AI ——情感分类(LSTM)
基于Bi-LSTM的文本情感分类
Python篇—文本分类
keras 例子
一种使用快速、简单有效的文本分类方法
基于LSTM三分类的文本情感分析

#!usr/bin/python
# coding=utf-8
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from keras.models import Model
from keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding
from keras.optimizers import RMSprop
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.callbacks import EarlyStopping
# ## 设置字体
# from matplotlib.font_manager import FontProperties
# # fonts = FontProperties(fname = "/Library/Fonts/华文细黑.ttf",size=14)
# # %config InlineBackend.figure_format = 'retina'
# %matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
# 画图支持中文显示
from pylab import *
matplotlib.rcParams['font.sans-serif'] = ['SimHei']

# 显示所有列
pd.set_option('display.max_columns', None)
# 显示所有行
pd.set_option('display.max_rows', None)
# 设置value的显示长度为10000，默认为50
pd.set_option('display.width',10000)
pd.set_option('display.unicode.ambiguous_as_wide', True)
pd.set_option('display.unicode.east_asian_width', True)
#
np.set_printoptions(linewidth=1000)

## 读取测数据集
train_df = pd.read_csv("G:\\rnn\lstm\cnews-LSTM\cnews_train.csv")
val_df = pd.read_csv("G:\\rnn\lstm\cnews-LSTM\cnews_val.csv")
test_df = pd.read_csv("G:\\rnn\lstm\cnews-LSTM\cnews_test.csv")
# print(train_df.head())
# print(val_df.head())
# print(test_df.head())
print(train_df.iloc[1:2,[2]])
print(type(train_df.iloc[1:2,[2]]))
# print(len((train_df.iloc[1:2,[2]]).values), (train_df.iloc[1:2,[2]]).values)
print((train_df.iloc[1:2,[2]]).values.tolist())
print(len(set(((train_df.iloc[1:2,[2]]).values.tolist())[0][0])))
print(train_df.iloc[1:2,[3]]) # 第4列为对应词组的个数
# -------------------------------------------------------------
a = list(filter(None, (((train_df.iloc[0:1,[2]]).values.tolist())[0][0]).split(" ")))
b = list(filter(None, (((train_df.iloc[1:2,[2]]).values.tolist())[0][0]).split(" ")))
print(len(a), len(b), a, b)
a.extend(b)
print(a)
print(len(a), len(set(a)))
# 该数据集已经进行了处理，每个数据集包含4列数据，其中第一列为标签数据，第二列为新闻的原文数据，第三列为经过分词、去停用词等操作，并使用空格连接的分词后数据，第4列为对应词组的个数。

# 数据探索：查看训练集都有哪些标签
plt.figure()
sns.countplot(train_df.label)
# plt.xlabel('Label',fontproperties = fonts,size = 10)
# plt.xticks(fontproperties = fonts,size = 10)
plt.xlabel('Label',size = 10)
plt.xticks(size = 10)
plt.show()
# 分析训练集中词组数量的分布
print(train_df.cutwordnum.describe())
plt.figure()
plt.hist(train_df.cutwordnum,bins=100)
# plt.xlabel("词组长度",fontproperties = fonts,size = 12)
# plt.ylabel("频数",fontproperties = fonts,size = 12)
# plt.title("训练数据集",fontproperties = fonts)
plt.xlabel("词组长度",size = 12)
plt.ylabel("频数",size = 12)
plt.title("训练数据集")
plt.show()

# 接下来对数据集的标签数据进行编码，首先是LabelEncoder()编码，然后是进行OneHotEncoder()编码。-------------------------------编码标签
# 对数据集的标签数据进行编码
train_y = train_df.label
val_y = val_df.label
test_y = test_df.label
le = LabelEncoder()
train_y = le.fit_transform(train_y).reshape(-1,1)
val_y = le.transform(val_y).reshape(-1,1)
test_y = le.transform(test_y).reshape(-1,1)
# 对数据集的标签数据进行one-hot编码
ohe = OneHotEncoder()
train_y = ohe.fit_transform(train_y).toarray()
val_y = ohe.transform(val_y).toarray()
test_y = ohe.transform(test_y).toarray()

# 使用Tokenizer对词组进行编码--------------------------------------------------------------------------------------------编码文本
# 当我们创建了一个Tokenizer对象后，使用该对象的fit_on_texts()函数，以空格去识别每个词,
# 可以将输入的文本中的每个词编号，编号是根据词频的，词频越大，编号越小。
max_words = 5000
max_len = 600
tok = Tokenizer(num_words=max_words)  ## 使用的最大词语数为5000
tok.fit_on_texts(train_df.cutword)
# 使用word_index属性可以看到每次词对应的编码
# 使用word_counts属性可以看到每个词对应的频数
for ii,iterm in enumerate(tok.word_index.items()):
    if ii < 10:
        print(iterm)
    else:
        break
print("===================")
for ii,iterm in enumerate(tok.word_counts.items()):
    if ii < 10:
        print(iterm)
    else:
        break

# 使用tok.texts_to_sequences()将数据转化为序列，并使用sequence.pad_sequences()将每个序列调整为相同的长度
# 对每个词编码之后，每句新闻中的每个词就可以用对应的编码表示，即每条新闻可以转变成一个向量了：
train_seq = tok.texts_to_sequences(train_df.cutword)
val_seq = tok.texts_to_sequences(val_df.cutword)
test_seq = tok.texts_to_sequences(test_df.cutword)
## 将每个序列调整为相同的长度
train_seq_mat = sequence.pad_sequences(train_seq,maxlen=max_len)
val_seq_mat = sequence.pad_sequences(val_seq,maxlen=max_len)
test_seq_mat = sequence.pad_sequences(test_seq,maxlen=max_len)

print(train_seq_mat.shape)
print(val_seq_mat.shape)
print(test_seq_mat.shape)

# 定义LSTM模型
inputs = Input(name='inputs',shape=[max_len])
## Embedding(词汇表大小,batch大小,每个新闻的词长)
layer = Embedding(max_words+1,128,input_length=max_len)(inputs)
# layer = LSTM(128)(layer)
layer = LSTM(8)(layer)
layer = Dense(128,activation="relu",name="FC1")(layer)
layer = Dropout(0.5)(layer)
layer = Dense(10,activation="softmax",name="FC2")(layer)
model = Model(inputs=inputs,outputs=layer)
model.summary()
model.compile(loss="categorical_crossentropy",optimizer=RMSprop(),metrics=["accuracy"])

# 模型训练
# model_fit = model.fit(train_seq_mat,train_y,batch_size=128,epochs=10,
#                       validation_data=(val_seq_mat,val_y),
#                       callbacks=[EarlyStopping(monitor='val_loss',min_delta=0.0001)] ## 当val-loss不再提升时停止训练
#                      )
model_fit = model.fit(train_seq_mat,train_y,batch_size=128,epochs=1,
                      validation_data=(val_seq_mat,val_y),
                      callbacks=[EarlyStopping(monitor='val_loss',min_delta=0.0001)] ## 当val-loss不再提升时停止训练
                     )

# -----------------------------------------------------------------------------------------------------------------
# 对测试集进行预测
test_pre = model.predict(test_seq_mat)

## 评价预测效果，计算混淆矩阵
confm = metrics.confusion_matrix(np.argmax(test_pre,axis=1),np.argmax(test_y,axis=1))
## 混淆矩阵可视化
Labname = ["体育","娱乐","家居","房产","教育","时尚","时政","游戏","科技","财经"]
plt.figure(figsize=(8,8))
sns.heatmap(confm.T, square=True, annot=True,
            fmt='d', cbar=False,linewidths=.8,
            cmap="YlGnBu")
plt.xlabel('True label',size = 14)
plt.ylabel('Predicted label',size = 14)
plt.xticks(np.arange(10)+0.5,Labname,fontproperties = fonts,size = 12)
plt.yticks(np.arange(10)+0.3,Labname,fontproperties = fonts,size = 12)
plt.show()

print(metrics.classification_report(np.argmax(test_pre,axis=1),np.argmax(test_y,axis=1)))

参考：
https://www.cnblogs.com/BobHuang/p/11157489.html
https://kexue.fm/archives/3414
https://zhuanlan.zhihu.com/p/39884984
https://www.jianshu.com/p/caee648f6a1f
https://zhuanlan.zhihu.com/p/50657430

2018-11-10
LSTM的应用文本分类LSTM文本分类：使用LSTM的最后一个状态文本分类中不用one-hot编码，使用emb...
基于LSTM的文本分类
参考：https://www.cnblogs.com/BobHuang/p/11157489.htmlhttps:...
多分类
基于LSTM的中文文本多分类实战sklearn 多种分类实现支持向量机1支持向量机2 随机森林理论内容
基于LSTM的文本分类模型
参考网站： https://www.jianshu.com/p/c1cd2d127ae2
从代码学AI ——情感分类(LSTM)
前言本篇文章会从代码的角度说明如何基于TFlearn使用LSTM进行文本的情感分类。如果对于TFLearn和LS...
基于Bi-LSTM的文本情感分类
参考：一维卷积网络IMDB情感分析双向LSTM情感分类模型利用DNN对EMAIL分类基于Keras构建RNN模型 ...
Python篇—文本分类
1.自然语言整体概览文本分类方法集锦基于text-cnn文本分类基于SVM基于dnn的文本情感分析多类别文本分类...
keras 例子
基于多层感知机的softmax分类 MLP二分类卷积神经网络使用LSTM的序列分类使用ID卷积的序列分类
一种使用快速、简单有效的文本分类方法
问题及需求使用CNN，LSTM，Attention based model建立一个文本分类深度模型固然很好很强大...
基于LSTM三分类的文本情感分析
背景介绍文本情感分析作为NLP的常见任务，具有很高的实际应用价值。本文将采用LSTM模型，训练一个能够识别文本p...

基于LSTM的文本分类

相关文章

2018-11-10

基于LSTM的文本分类

多分类

基于LSTM的文本分类模型

从代码学AI ——情感分类(LSTM)

基于Bi-LSTM的文本情感分类

Python篇—文本分类

keras 例子

一种使用快速、简单有效的文本分类方法

基于LSTM三分类的文本情感分析

网友评论

延伸阅读

深度阅读

栏目导航

热点阅读