sklearn 提供了 20 newsgroups 数据集(通过 fetch_20newsgroups 获取),其中包含 20 个类别的新闻文本数据;而朴素贝叶斯算法在文本分类的应用场景中十分实用。今天就用 naive_bayes 模块来练习一下文本分类。代码如下:
#-*- coding:utf-8 -*-
# Text classification on the 20 newsgroups corpus with a multinomial
# naive Bayes model: load the data, split it, turn the raw text into
# token-count vectors, train, then report accuracy and per-class metrics
# on the held-out test split.
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB

try:
    # `sklearn.cross_validation` was removed in scikit-learn 0.20;
    # `train_test_split` now lives in `sklearn.model_selection`.
    from sklearn.model_selection import train_test_split
except ImportError:
    # Fallback for legacy scikit-learn releases that predate model_selection.
    from sklearn.cross_validation import train_test_split

# Load every available document (subset='all').
news = fetch_20newsgroups(subset='all')
print(len(news.data))
print(news.data[0])

# Hold out 25% of the documents for evaluation; fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    news.data, news.target, test_size=0.25, random_state=33)

# CountVectorizer extracts feature vectors from the raw text.
# The vocabulary is fitted on the training split only and then reused to
# transform the test split.
vec = CountVectorizer()
X_train = vec.fit_transform(X_train)
X_test = vec.transform(X_test)

NB = MultinomialNB()
NB.fit(X_train, y_train)
# Predict on the TEST split: the report below compares these predictions
# against y_test, so predicting on X_train (as the original did) yields a
# length mismatch and meaningless metrics.
y_predict = NB.predict(X_test)

# Performance analysis.
print('朴素贝叶斯分类准确率:', NB.score(X_test, y_test))
xnfx = classification_report(y_test, y_predict, target_names=news.target_names)
print(xnfx)
网友评论