Sentiment analysis on a Kaggle movie-review dataset.
The main task is converting the natural-language reviews into word vectors, which then serve as the features for training a classifier.
import os
import re
import numpy as np
import pandas as pd
import sys
from bs4 import BeautifulSoup
from gensim.models.word2vec import Word2Vec
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.cluster import KMeans
import nltk.data
import warnings
warnings.filterwarnings("ignore")
def load_dataset(name, nrows=None):
    datasets = {
        "unlabeled_train": "unlabeledTrainData.tsv",
        "labeled_train": "labeledTrainData.tsv",
        "test": "testData.tsv",
    }
    if name not in datasets:
        raise ValueError("unknown dataset name: {}".format(name))
    data_file = datasets[name]
    # escapechar is needed because the TSV files escape embedded quotes;
    # nrows (previously ignored) limits how many rows are read
    df = pd.read_csv(data_file, sep="\t", escapechar="\\", nrows=nrows)
    return df
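For reference, these files come from Kaggle's "Bag of Words Meets Bags of Popcorn" competition: labeledTrainData.tsv has columns id, sentiment, and review (25,000 rows in the original data), while the unlabeled and test files have no sentiment column. A quick check:

df = load_dataset("labeled_train")
print(df.columns.tolist())  # expected: ['id', 'sentiment', 'review']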
def clean_text(text, remove_stopwords=False):
    # Strip HTML tags, drop non-letters, lowercase, and tokenize
    text = BeautifulSoup(text, "html.parser").get_text()
    text = re.sub(r"[^a-zA-Z]", " ", text)
    words = text.lower().split()
    if remove_stopwords:
        with open("stopwords.txt", "r") as f:
            eng_stopwords = set(line.strip() for line in f)
        words = [w for w in words if w not in eng_stopwords]
    return words
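A quick illustration of what clean_text returns: HTML is stripped, non-letters become spaces, and the text is lowercased into a token list.

clean_text("<b>This movie is great!</b>")
# -> ['this', 'movie', 'is', 'great']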
# Load the NLTK Punkt sentence tokenizer once at module level instead of
# on every call (requires nltk.download("punkt") beforehand)
tokenizer = nltk.data.load("tokenizers/punkt/english.pickle")

def split_sentences(review):
    # Split a review paragraph into sentences, then clean each one
    raw_sentences = tokenizer.tokenize(review.strip())
    sentences = [clean_text(s) for s in raw_sentences if s]
    return sentences
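split_sentences yields exactly the shape Word2Vec expects as input: a list of sentences, each a list of lowercase tokens.

split_sentences("I loved it. Best film ever!")
# -> [['i', 'loved', 'it'], ['best', 'film', 'ever']]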
def training():
    df = load_dataset("unlabeled_train")
    # sum(..., []) flattens the per-review sentence lists into one
    # list of tokenized sentences (only the first 1000 reviews here)
    sentences = sum(df.review[0:1000].apply(split_sentences), [])
    # Word2Vec training parameters
    num_features = 300    # Word vector dimensionality
    min_word_count = 40   # Minimum word count
    num_workers = 4       # Number of threads to run in parallel
    context = 10          # Context window size
    downsampling = 1e-3   # Downsample setting for frequent words
    model_name = "{}features_{}minwords_{}context.model".format(
        num_features, min_word_count, context)
    # Train the model (gensim < 4.0 API; gensim >= 4.0 renames
    # size= to vector_size= and removes init_sims)
    model = Word2Vec(sentences, workers=num_workers, size=num_features,
                     min_count=min_word_count, window=context,
                     sample=downsampling)
    # Precompute L2-normalized vectors and discard training state
    model.init_sims(replace=True)
    model.save(model_name)
    # Load the saved model (demonstrates that it round-trips to disk)
    model_name = "300features_40minwords_10context.model"
    model = Word2Vec.load(model_name)
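    # Sanity check (sketch): similarity queries on the freshly trained
    # vectors; results depend heavily on the small corpus used above
    if "movie" in model.wv:
        print(model.wv.most_similar("movie", topn=3))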
df = load_dataset("labeled_train")
def review_to_vector(review):
words = clean_text(review,remove_stopwords=True)
array = np.array([model[w] for w in words if w in model])
return pd.Series(array.mean(axis=0))
train_data_features = df.review.apply(review_to_vector)
forest = RandomForestClassifier(n_estimators=100,random_state=42)
fores = forest.fit(train_data_features,df.sentiment)
df = load_dataset("test")
test_data_feature = df.review.apply(review_to_vector)
results = forest.predict(test_data_feature)
output = pd.DataFrame({"id":df.id,"sentiment":results})
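The Kaggle test set is unlabeled, so the confusion_matrix imported above is best applied to a held-out slice of the labeled data. A minimal sketch, assuming review_to_vector (and the loaded word2vec model it uses) is in scope; the 80/20 split and classifier settings are arbitrary choices:

from sklearn.model_selection import train_test_split

labeled = load_dataset("labeled_train")
features = labeled.review.apply(review_to_vector)
X_train, X_val, y_train, y_val = train_test_split(
    features, labeled.sentiment, test_size=0.2, random_state=42)
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)
# Rows are true labels, columns are predicted labels
print(confusion_matrix(y_val, clf.predict(X_val)))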