word2vec: From Feature Extraction to Model Training

Author: 枫隐_5f5f | Published 2019-05-08 16:41

    Sentiment analysis of movie reviews from a Kaggle dataset.
    The main steps are converting the raw text into word vectors with word2vec and using the averaged vectors as features to train a classifier.

    import os
    import re
    import sys
    import numpy as np
    import pandas as pd
    from bs4 import BeautifulSoup
    from gensim.models.word2vec import Word2Vec
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import confusion_matrix
    from sklearn.cluster import KMeans
    import nltk.data
    import warnings
    warnings.filterwarnings("ignore")
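
    # NB: split_sentences() below relies on the NLTK "punkt" tokenizer data;
    # if it is not installed yet, download it once with nltk.download("punkt").
    # clean_text() also expects a plain-text "stopwords.txt" file
    # (one stopword per line) in the working directory.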
    
    
    def load_dataset(name, nrows=None):
        # Map dataset names to the Kaggle TSV files on disk
        dataset = {"unlabeled_train": "unlabeledTrainData.tsv",
                   "labeled_train": "labeledTrainData.tsv",
                   "test": "testData.tsv"}

        if name not in dataset:
            raise ValueError("Unknown dataset name: {}".format(name))
        data_file = dataset[name]
        df = pd.read_csv(data_file, sep="\t", escapechar="\\", nrows=nrows)
        return df
    
    
    
    def clean_text(text, remove_stopwords=False):
        # Strip HTML tags and keep only letters
        text = BeautifulSoup(text, "html.parser").get_text()
        text = re.sub(r"[^a-zA-Z]", " ", text)
        words = text.lower().split()
        if remove_stopwords:
            with open("stopwords.txt", "r") as f:
                eng_stopwords = set(line.strip() for line in f)
            words = [w for w in words if w not in eng_stopwords]
        return words
    
    def split_sentences(review):
        # Use the NLTK Punkt tokenizer to split a review into sentences
        tokenizer = nltk.data.load("tokenizers/punkt/english.pickle")
        raw_sentences = tokenizer.tokenize(review.strip())
        sentences = [clean_text(s) for s in raw_sentences if s]
        return sentences
    
    def training():
        df = load_dataset('unlabeled_train')
        # Flatten the per-review sentence lists (only the first 1000 reviews for speed)
        sentences = sum(df.review[0:1000].apply(split_sentences), [])

        # Word2Vec training parameters
        num_features = 300    # Word vector dimensionality
        min_word_count = 40   # Minimum word count
        num_workers = 4       # Number of threads to run in parallel
        context = 10          # Context window size
        downsampling = 1e-3   # Downsample setting for frequent words
        model_name = '{}features_{}minwords_{}context.model'.format(num_features, min_word_count, context)

        # Train the model (gensim 3.x API; in gensim 4+ the size argument is called vector_size)
        model = Word2Vec(sentences, workers=num_workers, size=num_features,
                         min_count=min_word_count, window=context, sample=downsampling)
        model.init_sims(replace=True)
        model.save(model_name)
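
    # The model file loaded below must already exist on disk; run training() once
    # (e.g. uncomment the next line) before the first use of this script.
    # training()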
    
    
    # Load the trained model
    model_name = '300features_40minwords_10context.model'
    model = Word2Vec.load(model_name)
    
    df = load_dataset("labeled_train")
    
    def review_to_vector(review):
        # Represent a review as the average of its in-vocabulary word vectors
        words = clean_text(review, remove_stopwords=True)
        vectors = np.array([model.wv[w] for w in words if w in model.wv.vocab])
        if len(vectors) == 0:
            return pd.Series(np.zeros(model.wv.vector_size))
        return pd.Series(vectors.mean(axis=0))
    
    train_data_features = df.review.apply(review_to_vector)
    
    forest = RandomForestClassifier(n_estimators=100, random_state=42)
    forest = forest.fit(train_data_features, df.sentiment)
    
    df = load_dataset("test")
    test_data_features = df.review.apply(review_to_vector)
    results = forest.predict(test_data_features)
    output = pd.DataFrame({"id": df.id, "sentiment": results})
    output.to_csv("word2vec_submission.csv", index=False)  # output file name chosen arbitrarily
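
    # As a quick sanity check of what the embedding has learned, the trained model
    # can be queried directly. This is a minimal sketch using the same gensim 3.x
    # API as above; the query words are arbitrary examples and must actually
    # appear in the model's (fairly small) vocabulary.
    print(model.wv.most_similar("movie", topn=5))                      # nearest neighbours
    print(model.wv.doesnt_match(["movie", "film", "actor", "awful"]))  # odd one out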
    
