词云

作者: dingtom | 来源:发表于2020-02-19 15:28 被阅读0次
    import requests
    import pandas as pd
    import jieba
    from wordcloud import WordCloud
    import matplotlib.pyplot as plt
    
    # Scrape the short comments ("abstract" field) of a Douban gallery topic
    # and append them to want_after.txt.
    #
    # The JSON endpoint was found with the browser's network inspector: while
    # scrolling the page, only the "start" query parameter changes
    # (0, 20, 40, 60, 80, ...), so paging is done by stepping "start".
    page_size = 20
    api_url = ('https://m.douban.com/rexxar/api/v2/gallery/topic/125573/items?'
               'sort=new&start={}&count={}&status_full_text=1&guest_only=0&ck=null')
    # Browser-like headers so the request is not rejected as a crawler.
    # Built once: they do not depend on the page being fetched.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/76.0.3809.100 Safari/537.36',
        'Referer': 'https://www.douban.com/gallery/topic/125573/'
                   '?from=gallery_trend&sort=hot',
    }
    # Open the output file once instead of once per comment.
    with open("want_after.txt", "a", encoding='utf-8') as f:
        for i in range(0, 200, page_size):
            url = api_url.format(i, page_size)
            print(url)
            # A timeout keeps a stalled connection from hanging the script.
            reponse = requests.get(url, headers=headers, timeout=10)
            # Fail loudly on an HTTP error instead of on a confusing
            # JSON/KeyError further down.
            reponse.raise_for_status()
            html = reponse.json()
            # Iterate over the items actually returned: a hard-coded index
            # range would skip the last item of a full page and raise
            # IndexError on a short one.
            for item in html['items']:
                abst = item['abstract']
                f.write(abst)
                print(abst)
    
    # Build the text format WordCloud expects: jieba segments the scraped
    # comments (cut_all=False) and the tokens are joined with single spaces.
    with open("want_after.txt", "r", encoding='utf-8') as f:
        words = ' '.join(jieba.cut(f.read(), cut_all=False))

    # Image whose shape is used as the mask of the cloud.
    background_image = plt.imread(r'C:\Users\tomding\Videos\图片1.png')
    # matplotlib returns PNG data as floats in [0, 1], whereas WordCloud
    # treats only pixels equal to 255 as "masked out". Rescale float data to
    # 0-255 unsigned bytes so white areas of the image are really excluded.
    if background_image.dtype.kind == 'f':
        background_image = (background_image * 255).round().astype('uint8')

    # Word cloud settings.
    wcloud = WordCloud(
        background_color='white',
        mask=background_image,
        # A font with CJK glyphs is needed to render Chinese words.
        font_path=r'c:\\windows\\Fonts\\simhei.ttf',
        max_words=200,
        max_font_size=200,
        min_font_size=8,
        # Fixed seed so the layout is reproducible between runs.
        random_state=50)

    # Generate the cloud, draw it without axes and save it to disk.
    word_cloud = wcloud.generate_from_text(words)
    plt.imshow(word_cloud)
    plt.axis('off')
    wcloud.to_file('结果.jpg')
    
    # Inspect the most frequent words: let the configured WordCloud instance
    # count the tokens, then rank the (word, count) pairs by count, highest
    # first.
    frequencies = wcloud.process_text(words)
    ranked = list(frequencies.items())
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    sort_after = ranked[:50]
    print(sort_after)

    # Save the top 50 as CSV; the utf_8_sig encoding writes a BOM so the
    # Chinese text is not garbled when the file is opened.
    pd.DataFrame(sort_after).to_csv('sort_after.csv', encoding='utf_8_sig')
    

    相关文章

      网友评论

          本文标题:词云

          本文链接:https://www.haomeiwen.com/subject/veuafhtx.html