LSI and LDA Topic Extraction on Trump's Tweets

Author: Gaius_Yao | Published 2018-07-29 19:26

      Trump is famous for "governing by Twitter." This dataset collects the text of more than seven thousand of his tweets from July 2015 through November 2016; below we try to extract their topics with LSI and LDA.

    0 Importing Packages

    # Scientific computing
    import numpy as np
    # Data wrangling and I/O
    import pandas as pd
    
    # Data visualization
    import matplotlib.pyplot as plt
    from matplotlib.cm import get_cmap
    from matplotlib.colors import rgb2hex
    plt.style.use('ggplot') # use the ggplot theme
    # Nicer default styles
    import seaborn as sns
    sns.set_style("whitegrid") # set the Seaborn theme
    # Interactive plots
    import plotly.graph_objs as go
    from plotly.offline import init_notebook_mode, iplot
    init_notebook_mode(connected=True) # Jupyter notebook mode
    
    # Word clouds
    from wordcloud import WordCloud
    from imageio import imread # read image files
    
    # Text vectorization
    from sklearn.feature_extraction.text import CountVectorizer
    # Decomposition models
    from sklearn.decomposition import TruncatedSVD
    from sklearn.decomposition import LatentDirichletAllocation
    from sklearn.decomposition import PCA
    # Visualizing high-dimensional data
    from sklearn.manifold import TSNE
    
    # Handy container types
    from collections import Counter
    
    # Stop words (run nltk.download('stopwords') once if the corpus is missing)
    from nltk.corpus import stopwords
    

    1 Preparing the Data

      Import the data and drop the rows whose Tweet_Text is NaN.

    # Import the data, rename the oddly named favourites column, and drop rows with no tweet text
    df = (pd.read_csv('data/Donald-Tweets!.csv', header=0)
            .rename(columns={'twt_favourites_IS_THIS_LIKE_QUESTION_MARK': 'Tweet_Like'})
            .dropna(subset=['Tweet_Text'], axis=0))
    df.info()
    

    <class 'pandas.core.frame.DataFrame'>
    Int64Index: 7375 entries, 0 to 7374
    Data columns (total 12 columns):
    Date 7375 non-null object
    Time 7375 non-null object
    Tweet_Text 7375 non-null object
    Type 7375 non-null object
    Media_Type 1225 non-null object
    Hashtags 2031 non-null object
    Tweet_Id 7375 non-null float64
    Tweet_Url 7375 non-null object
    Tweet_Like 7375 non-null int64
    Retweets 7375 non-null int64
    Unnamed: 10 26 non-null float64
    Unnamed: 11 13 non-null float64
    dtypes: float64(3), int64(2), object(7)
    memory usage: 749.0+ KB

    # Look at 5 sample rows
    df.sample(5)
    

      The sample (and df.info() above) shows that columns 10 and 11 have no proper names and are almost entirely NaN (only 26 and 13 stray values out of 7,375 rows), so we simply drop them.

    # Drop the blank, unnamed columns 10 and 11
    del df['Unnamed: 10']
    del df['Unnamed: 11']
    
    # Confirm they are gone
    df.sample()
    

      Also, every value in the Tweet_Id column has been mangled into scientific notation. By Twitter's URL convention, the last path segment of a tweet's URL is its ID, so we can recover Tweet_Id from Tweet_Url.

    # Rebuild Tweet_Id from the tail of the URL (the ID starts at character 43 in this dataset)
    df['Tweet_Id'] = df['Tweet_Url'].str[43:]
    
    df.sample()
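
      The fixed offset of 43 happens to fit this dataset's uniform URLs. A more robust sketch (assuming only that the numeric ID is the last path segment of every Tweet_Url) would split on '/' instead:

    # Alternative: take the last path segment as the ID, regardless of screen-name length
    df['Tweet_Id'] = df['Tweet_Url'].str.rstrip('/').str.split('/').str[-1]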
    

    2 Word Cloud

      Before extracting any topics, let's look at a word cloud and the most frequent words in Trump's tweets.

    # Build and extend the stop word set (note: this rebinds the name stopwords from the NLTK module to a plain set)
    stopwords = set(stopwords.words('english'))
    stopwords.update(['.', ',', '"', "'", '?', '!', ':', ';', '(', ')', '[', ']', '{', '}', '@', '#', 'rt', 'amp', 'realdonaldtrump', 'http', 'https', '/', '://', '_', 'co', 'trump', 'donald', 'makeamericagreatagain'])
    
    # Load the mask image
    mask = imread('img/trump.jpg')
    
    # Join all of Tweet_Text into one lowercase string (space-separated so adjacent tweets don't run together)
    twt_text = ' '.join(df['Tweet_Text'].astype(str)).lower()
    
    type(twt_text)
    

    str

    # Generate the word cloud
    cloud = WordCloud(
            background_color = 'white',
            stopwords = stopwords,
            mask = mask,
            max_words = 1024,
            max_font_size = 100
        )
    
    word_cloud = cloud.generate(twt_text)
    word_cloud.to_file('output/Trump_Cloud.jpg') # forward slash keeps the path portable
    
    plt.figure(figsize=(12,12))
    plt.imshow(word_cloud)
    plt.axis('off');
    

    3 Most Frequent Words

    # Vectorize the text (apostrophes are stripped first so contractions stay whole)
    countVectorizer = CountVectorizer(stop_words=stopwords)
    vectorizedText = countVectorizer.fit_transform(df['Tweet_Text'].str.replace("'", '').values)
    print('Shape Vectorized Text: {}'.format(vectorizedText.shape))
    

    Shape Vectorized Text: (7375, 13690)
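
      That is, 7,375 tweets over a vocabulary of 13,690 distinct tokens.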

    # Number of frequent words to show
    n = 20
    
    def nMostFrequentWords(n, countVectorizer, vectorizedText):
        """
            Find the most frequent words and how often they appear.
        
            Args:
                n: n most frequent words, int
                countVectorizer: CountVectorizer
                vectorizedText: vectorized text, csr_matrix
        
            Returns:
                words: most frequent words, list
                wordCounts: word appearance counts, list
        """
        # Total count of each word across the corpus
        vectorizedCount = np.sum(vectorizedText, axis=0)
        
        # Word indices and counts, sorted in descending order
        wordIndices = np.flip(np.argsort(vectorizedCount), 1)
        wordCounts = np.flip(np.sort(vectorizedCount), 1)
    
        # One-hot vectors for the top n words
        wordVectors = np.zeros((n, vectorizedText.shape[1]))
        for i in range(n):
            wordVectors[i, wordIndices[0,i]] = 1
    
        # Map the one-hot vectors back to the actual words
        words = [word[0].encode('ascii').decode('utf-8') for word in countVectorizer.inverse_transform(wordVectors)]
    
        # Return the most frequent words and their counts
        return (words, wordCounts[0, :n].tolist()[0])
    
    words, wordCounts = nMostFrequentWords(n=n, countVectorizer=countVectorizer, vectorizedText=vectorizedText)
    
    # Build a color map
    cmap = get_cmap('viridis')
    colors = [rgb2hex(cmap(color)) for color in np.arange(0, 1.000001, 1/(n-1))]
    
    # Draw the bar chart
    data = go.Bar(x = words,
                  y = wordCounts,
                  marker = dict(color = colors))
    
    layout = go.Layout(title = 'Most Frequent {} Words In Trump Tweets'.format(n),
                       xaxis = dict(title = 'Words'),
                       yaxis = dict(title = 'Count'))
    
    fig = go.Figure(data=[data], layout=layout)
    iplot(fig)
    

      Next we use LSI and LDA to extract topics. The theory and implementation of the two models deserve their own write-up; for now, here is a toy sketch of the core LSI idea.
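
      In a nutshell: LSI factorizes the document-term count matrix with a truncated SVD, giving each document coordinates over a few latent directions, while LDA models each document as a probability mixture over topics. A minimal sketch of the LSI view on a toy count matrix (all values below are illustrative, not from the dataset):

    # Toy document-term matrix: 4 documents x 5 terms (made-up counts)
    toyX = np.array([[2, 1, 0, 0, 0],
                     [1, 2, 0, 0, 0],
                     [0, 0, 1, 2, 1],
                     [0, 0, 2, 1, 1]], dtype=float)
    
    # LSI keeps only the top-k singular triplets of X ~ U S Vt
    U, S, Vt = np.linalg.svd(toyX, full_matrices=False)
    k = 2
    docTopic = U[:, :k] * S[:k]  # document coordinates in the k-dimensional topic space
    print(docTopic.round(2))     # the two document groups separate cleanly on the two axes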

    4 LSI

    # Number of topics
    nTopics = 8
    
    # Fit the LSI model (a truncated SVD of the count matrix)
    lsiModel = TruncatedSVD(n_components=nTopics)
    lsiTopicMatrix = lsiModel.fit_transform(vectorizedText)
    print('Shape LSI Topic Matrix: {}'.format(lsiTopicMatrix.shape))
    

    Shape LSI Topic Matrix: (7375, 8)
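
      Each row of the LSI topic matrix scores one tweet against the 8 components; taking the row-wise argmax assigns every tweet to the component it loads on most heavily. SVD scores are not probabilities, so this is only a rough hard assignment.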

    # Assign each tweet to its strongest component and count tweets per component
    lsiKeys = lsiTopicMatrix.argmax(axis=1)
    lsiCategories, lsiCounts = zip(*Counter(lsiKeys).items())
    
    def getTopWords(n, lsiKeys, vectorizedText, countVectorizer):
        """
            Get the top words for each topic.
        
            Args:
                n: n top words, int
                lsiKeys: most probable keys, ndarray
                countVectorizer: CountVectorizer
                vectorizedText: vectorized text, csr_matrix
        
            Returns:
                topWords: top n words per topic, list
        """
        # Mean word counts per topic (relies on the global nTopics)
        wordMean = np.zeros((nTopics, vectorizedText.shape[1]))
        # Iterate over the topics that actually occur
        for i in np.unique(lsiKeys):
            wordMean[i] += vectorizedText.toarray()[lsiKeys==i].mean(axis=0)
        
        # Sort each topic's means and keep the n top words, with each word's share of the topic as a percentage
        topWordsIndices = np.flip(np.argsort(wordMean, axis=1)[:, -n:], axis=1)
        topWordsPercentage = (np.divide(np.flip(np.sort(wordMean, axis=1)[:, -n:], axis=1), (np.sum(wordMean, axis=1)+0.0000001)[:, None])*100).astype(int)
        
        # Collect the words of all topics
        topWords = []
    
        # Iterate over the topics by index
        for i, (topic, percentage) in enumerate(zip(topWordsIndices, topWordsPercentage)):
            # Words for this one topic
            topicWords = []
    
            if i in np.unique(lsiKeys):
                # Iterate over the topic's word indices
                for index, percent in zip(topic, percentage):
                    # One-hot vector for this word index (2D, as inverse_transform expects)
                    wordVector = np.zeros((1, vectorizedText.shape[1]))
                    wordVector[0, index] = 1
                    # Map the vector back to the word itself
                    word = countVectorizer.inverse_transform(wordVector)[0][0]
                    topicWords.append('{}% '.format(percent) + word.encode('ascii').decode('utf-8'))
            # Join this topic's words into one string
            topWords.append(', '.join(topicWords))
    
        return topWords
    
    topWords = getTopWords(5, lsiKeys, vectorizedText, countVectorizer)
    
    for i, words in enumerate(topWords):
        print('Topic {}: {}'.format(i, words))
    

    Topic 0: 5% great, 4% thank, 1% america, 1% make, 1% trump2016
    Topic 1: 19% hillaryforprison2016, 19% williamesammon1, 19% whereshillary, 19% rueu1ctbz8, 19% sleeping
    Topic 2: 2% hillary, 1% clinton, 1% crooked, 0% president, 0% said
    Topic 3: 1% new, 1% poll, 1% cruz, 1% people, 1% big
    Topic 4: 6% poll, 4% new, 3% trump2016, 2% america, 1% join
    Topic 5: 4% trump2016, 1% people, 1% join, 1% tomorrow, 0% us
    Topic 6: 2% get, 2% america, 2% vote, 1% like, 0% time
    Topic 7: 2% foxnews, 1% cnn, 1% tonight, 1% enjoy, 1% interviewed
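
      Topic 0 is the generic "great / thank / america" campaign cluster, Topic 2 is clearly about Hillary Clinton, and Topic 7 about cable-news appearances. Topic 1's five near-equal 19% terms suggest that component latched onto a handful of near-duplicate tweets rather than a broad theme.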

    # Sort categories and counts together
    lsiCategoriesSorted, lsiCountsSorted = zip(*sorted(zip(lsiCategories, lsiCounts)))
    
    # Build the labels
    topWords = getTopWords(5, lsiKeys, vectorizedText, countVectorizer)
    labels = ['Topic {}'.format(i) for i in lsiCategoriesSorted]
    
    # Build a color map
    n = nTopics
    cmap = get_cmap('viridis')
    colors = [rgb2hex(cmap(color)) for color in np.arange(0, 1.000001, 1/(n-1))]
    
    # Draw the bar chart
    data = go.Bar(x = labels,
                  y = lsiCountsSorted,
                  text = [word for word in topWords if word],
                  marker = dict(color = colors))
    
    layout = go.Layout(title = 'Most Frequent LSI Topics In Trump Tweets',
                       xaxis = dict(title = 'Topic'),
                       yaxis = dict(title = 'Count'))
    
    fig = go.Figure(data=[data], layout=layout)
    iplot(fig)
    
    # Reduce the LSI topic matrix to 2D for visualization
    tsneModel = TSNE(n_components=2, perplexity=50, learning_rate=100, n_iter=2000, verbose=1, random_state=0, angle=0.75)
    tsneModelVectors = tsneModel.fit_transform(lsiTopicMatrix)
    

    [t-SNE] Computing 151 nearest neighbors...
    [t-SNE] Indexed 7375 samples in 0.006s...
    [t-SNE] Computed neighbors for 7375 samples in 0.605s...
    [t-SNE] Computed conditional probabilities for sample 1000 / 7375
    [t-SNE] Computed conditional probabilities for sample 2000 / 7375
    [t-SNE] Computed conditional probabilities for sample 3000 / 7375
    [t-SNE] Computed conditional probabilities for sample 4000 / 7375
    [t-SNE] Computed conditional probabilities for sample 5000 / 7375
    [t-SNE] Computed conditional probabilities for sample 6000 / 7375
    [t-SNE] Computed conditional probabilities for sample 7000 / 7375
    [t-SNE] Computed conditional probabilities for sample 7375 / 7375
    [t-SNE] Mean sigma: 0.016840
    [t-SNE] KL divergence after 250 iterations with early exaggeration: 65.068428
    [t-SNE] Error after 2000 iterations: 0.829952

    # Build a color map
    n = nTopics
    cmap = get_cmap('tab10')
    colors = [rgb2hex(cmap(color)) for color in np.arange(0, 1.000001, 1/(n-1))]
    
    # Get the top words per topic
    topWords = getTopWords(3, lsiKeys, vectorizedText, countVectorizer)
    
    # Draw the scatter plot
    data = []
    # Iterate over the topics
    for topic in range(nTopics):
        mask = lsiKeys==topic
        # Randomly sample at most ~1000/nTopics tweets per topic to keep the plot responsive
        sample_mask = np.zeros(mask.sum()).astype(bool)
        sample_mask[:int(1000/nTopics)] = True
        np.random.shuffle(sample_mask)
        
        scatter = go.Scatter(x = tsneModelVectors[mask,0][sample_mask],
                             y = tsneModelVectors[mask,1][sample_mask],
                             name = 'Topic {}: {}'.format(topic, topWords[topic]),
                             mode = 'markers',
                             text = df[mask]['Tweet_Text'][sample_mask],
                             marker = dict(color = colors[topic]))
        data.append(scatter)
    
    layout = go.Layout(title = 't-SNE Clustering of {} LSI Topics'.format(nTopics),
                       showlegend=True,
                       hovermode = 'closest')
    
    fig = go.Figure(data=data, layout=layout)
    iplot(fig)
    

    5 LDA

    # Fit the LDA model
    ldaModel = LatentDirichletAllocation(n_components=nTopics, learning_method='online', random_state=0, verbose=0)
    ldaTopicMatrix = ldaModel.fit_transform(vectorizedText)
    print('Shape LDA Topic Matrix: {}'.format(ldaTopicMatrix.shape))
    

    Shape LDA Topic Matrix: (7375, 8)
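
      Unlike the LSI scores, each row of the LDA topic matrix is a probability distribution over the 8 topics (non-negative entries summing to 1), so the argmax below simply picks each tweet's dominant topic.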

    # Assign each tweet to its most probable topic and count tweets per topic
    ldaKeys = ldaTopicMatrix.argmax(axis=1)
    ldaCategories, ldaCounts = zip(*Counter(ldaKeys).items())
    
    # Get the top words per topic
    topWords = getTopWords(5, ldaKeys, vectorizedText, countVectorizer)
    
    # Print each topic with its top words
    for i, words in enumerate(topWords):
        print('Topic {}: {}'.format(i, words))
    

    Topic 0: 1% hillary, 1% clinton, 0% crooked, 0% people, 0% president
    Topic 1: 2% trump2016, 2% great, 1% tonight, 1% thank, 1% enjoy
    Topic 2: 1% great, 0% join, 0% thank, 0% country, 0% tomorrow
    Topic 3: 0% thanks, 0% gop, 0% many, 0% america, 0% hillaryclinton
    Topic 4: 1% hillary, 1% cruz, 0% poll, 0% ted, 0% cnn
    Topic 5: 3% thank, 2% great, 2% new, 1% poll, 1% trump2016
    Topic 6: 4% great, 3% america, 2% make, 1% thank, 0% trump2016
    Topic 7: 1% thank, 1% trump2016, 1% like, 0% great, 0% see

    # Sort categories and counts together
    ldaCategoriesSorted, ldaCountsSorted = zip(*sorted(zip(ldaCategories, ldaCounts)))
    
    # Build the labels
    topWords = getTopWords(5, ldaKeys, vectorizedText, countVectorizer)
    labels = ['Topic {}'.format(i) for i in ldaCategoriesSorted]
    
    # Build a color map
    n = nTopics
    cmap = get_cmap('viridis')
    colors = [rgb2hex(cmap(color)) for color in np.arange(0, 1.000001, 1/(n-1))]
    
    # Draw the bar chart
    data = go.Bar(x = labels,
                  y = ldaCountsSorted,
                  text = [word for word in topWords if word],
                  marker = dict(color = colors))
    
    layout = go.Layout(title = 'Most Frequent LDA Topics In Trump Tweets',
                       xaxis = dict(title = 'Topic'),
                       yaxis = dict(title = 'Count'))
    
    fig = go.Figure(data=[data], layout=layout)
    iplot(fig)
    
    # Reduce the LDA topic matrix to 2D for visualization
    tsneModel = TSNE(n_components=2, perplexity=50, learning_rate=100, n_iter=2000, verbose=1, random_state=0, angle=0.75)
    tsneModelVectors = tsneModel.fit_transform(ldaTopicMatrix)
    

    [t-SNE] Computing 151 nearest neighbors...
    [t-SNE] Indexed 7375 samples in 0.011s...
    [t-SNE] Computed neighbors for 7375 samples in 0.737s...
    [t-SNE] Computed conditional probabilities for sample 1000 / 7375
    [t-SNE] Computed conditional probabilities for sample 2000 / 7375
    [t-SNE] Computed conditional probabilities for sample 3000 / 7375
    [t-SNE] Computed conditional probabilities for sample 4000 / 7375
    [t-SNE] Computed conditional probabilities for sample 5000 / 7375
    [t-SNE] Computed conditional probabilities for sample 6000 / 7375
    [t-SNE] Computed conditional probabilities for sample 7000 / 7375
    [t-SNE] Computed conditional probabilities for sample 7375 / 7375
    [t-SNE] Mean sigma: 0.068215
    [t-SNE] KL divergence after 250 iterations with early exaggeration: 76.528473
    [t-SNE] Error after 2000 iterations: 1.265168

    # Build a color map
    n = nTopics
    cmap = get_cmap('tab10')
    colors = [rgb2hex(cmap(color)) for color in np.arange(0, 1.000001, 1/(n-1))]
    
    # Get the top words per topic
    topWords = getTopWords(3, ldaKeys, vectorizedText, countVectorizer)
    
    # Draw the scatter plot
    data = []
    # Iterate over the topics
    for topic in range(nTopics):
        mask = ldaKeys==topic
        # Randomly sample at most ~1000/nTopics tweets per topic to keep the plot responsive
        sample_mask = np.zeros(mask.sum()).astype(bool)
        sample_mask[:int(1000/nTopics)] = True
        np.random.shuffle(sample_mask)
        
        scatter = go.Scatter(x = tsneModelVectors[mask,0][sample_mask],
                             y = tsneModelVectors[mask,1][sample_mask],
                             name = 'Topic {}: {}'.format(topic, topWords[topic]),
                             mode = 'markers',
                             text = df[mask]['Tweet_Text'][sample_mask],
                             marker = dict(color = colors[topic]))
        data.append(scatter)
    
    layout = go.Layout(title = 't-SNE Clustering of {} LDA Topics'.format(nTopics),
                       showlegend=True,
                       hovermode = 'closest')
    
    fig = go.Figure(data=data, layout=layout)
    iplot(fig)
    

    6 PCA
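
      PCA is closely related to LSI: both are truncated singular value decompositions of the document-term matrix, but PCA centers each feature first, so its leading components capture variance around the mean rather than raw co-occurrence. Centering also means sklearn's PCA needs a dense array, hence the toarray() call below.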

    # Fit the PCA model (PCA requires a dense array)
    pcaModel = PCA(n_components=nTopics, random_state=0)
    pcaTopicMatrix = pcaModel.fit_transform(vectorizedText.toarray())
    print('Shape PCA Topic Matrix: {}'.format(pcaTopicMatrix.shape))
    

    Shape PCA Topic Matrix: (7375, 8)

    # Assign each tweet to its strongest component and count tweets per component
    pcaKeys = pcaTopicMatrix.argmax(axis=1)
    pcaCategories, pcaCounts = zip(*Counter(pcaKeys).items())
    
    # Get the top words per topic
    topWords = getTopWords(5, pcaKeys, vectorizedText, countVectorizer)
    
    # Print each topic with its top words
    for i, words in enumerate(topWords):
        print('Topic {}: {}'.format(i, words))
    

    Topic 0: 10% great, 2% america, 2% make, 2% thank, 0% people
    Topic 1: 11% thank, 3% trump2016, 1% support, 1% nice, 0% new
    Topic 2: 6% hillary, 4% clinton, 2% crooked, 0% bad, 0% bernie
    Topic 3: 3% poll, 3% new, 1% cruz, 1% big, 1% debate
    Topic 4: 1% america, 0% join, 0% jeb, 0% president, 0% make
    Topic 5: 8% trump2016, 1% tomorrow, 1% join, 0% job, 0% danscavino
    Topic 6: 2% people, 1% get, 1% like, 0% vote, 0% time
    Topic 7: 2% foxnews, 2% cnn, 1% tonight, 1% enjoy, 1% interviewed

    # Sort categories and counts together
    pcaCategoriesSorted, pcaCountsSorted = zip(*sorted(zip(pcaCategories, pcaCounts)))
    
    # Build the labels
    topWords = getTopWords(5, pcaKeys, vectorizedText, countVectorizer)
    labels = ['Topic {}'.format(i) for i in pcaCategoriesSorted]
    
    # Build a color map
    n = nTopics
    cmap = get_cmap('viridis')
    colors = [rgb2hex(cmap(color)) for color in np.arange(0, 1.000001, 1/(n-1))]
    
    # Draw the bar chart
    data = go.Bar(x = labels,
                  y = pcaCountsSorted,
                  text = [word for word in topWords if word],
                  marker = dict(color = colors))
    
    layout = go.Layout(title = 'Most Frequent PCA Topics In Trump Tweets',
                       xaxis = dict(title = 'Topic'),
                       yaxis = dict(title = 'Count'))
    
    fig = go.Figure(data=[data], layout=layout)
    iplot(fig)
    
    # Reduce the PCA topic matrix to 2D for visualization
    tsneModel = TSNE(n_components=2, perplexity=50, learning_rate=100, n_iter=2000, verbose=1, random_state=0, angle=0.75)
    tsneModelVectors = tsneModel.fit_transform(pcaTopicMatrix)
    

    [t-SNE] Computing 151 nearest neighbors...
    [t-SNE] Indexed 7375 samples in 0.010s...
    [t-SNE] Computed neighbors for 7375 samples in 0.681s...
    [t-SNE] Computed conditional probabilities for sample 1000 / 7375
    [t-SNE] Computed conditional probabilities for sample 2000 / 7375
    [t-SNE] Computed conditional probabilities for sample 3000 / 7375
    [t-SNE] Computed conditional probabilities for sample 4000 / 7375
    [t-SNE] Computed conditional probabilities for sample 5000 / 7375
    [t-SNE] Computed conditional probabilities for sample 6000 / 7375
    [t-SNE] Computed conditional probabilities for sample 7000 / 7375
    [t-SNE] Computed conditional probabilities for sample 7375 / 7375
    [t-SNE] Mean sigma: 0.018791
    [t-SNE] KL divergence after 250 iterations with early exaggeration: 65.240501
    [t-SNE] Error after 2000 iterations: 0.823488

    # Build a color map
    n = nTopics
    cmap = get_cmap('tab10')
    colors = [rgb2hex(cmap(color)) for color in np.arange(0, 1.000001, 1/(n-1))]
    
    # Get the top words per topic
    topWords = getTopWords(3, pcaKeys, vectorizedText, countVectorizer)
    
    # Draw the scatter plot
    data = []
    # Iterate over the topics
    for topic in range(nTopics):
        mask = pcaKeys==topic
        # Randomly sample at most ~1000/nTopics tweets per topic to keep the plot responsive
        sample_mask = np.zeros(mask.sum()).astype(bool)
        sample_mask[:int(1000/nTopics)] = True
        np.random.shuffle(sample_mask)
        
        scatter = go.Scatter(x = tsneModelVectors[mask,0][sample_mask],
                             y = tsneModelVectors[mask,1][sample_mask],
                             name = 'Topic {}: {}'.format(topic, topWords[topic]),
                             mode = 'markers',
                             text = df[mask]['Tweet_Text'][sample_mask],
                             marker = dict(color = colors[topic]))
        data.append(scatter)
    
    layout = go.Layout(title = 't-SNE Clustering of {} PCA Topics'.format(nTopics),
                       showlegend=True,
                       hovermode = 'closest')
    
    fig = go.Figure(data=data, layout=layout)
    iplot(fig)
    
