美文网首页
Python豆瓣影评爬虫及词云生成

Python豆瓣影评爬虫及词云生成

作者: LinoX | 来源:发表于2019-01-31 19:53 被阅读0次

    没错又来水博客

    和图书爬虫思路一致,加了点花里胡哨的东西

    直接上代码
    • 写入数据部分
    # 作者:Lino
    # 参考于作者:Charles
    
    import re
    import os
    import requests
    from bs4 import BeautifulSoup
    import bs4
    import xlwt
    import time
    import pickle
    
    
    # 简化版豆瓣影评获取器
    # 生成词云
    # 影评及作者.xls保存于当前目录
    # 暂不拥有模拟登录功能
    
    
    def get_page(url):
        """Fetch *url* and return its HTML text, or "" on any request failure.

        A browser User-Agent is sent because Douban rejects the default
        python-requests UA.
        """
        headers = {
            # BUGFIX: the key was misspelled 'Uesr-Agent', so the UA string
            # below was never actually sent with the request.
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                          '(KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
        }
        try:
            # timeout keeps the scraper from hanging forever on a dead socket.
            res = requests.get(url, headers=headers, timeout=10)
            res.raise_for_status()
            return res.text
        except requests.RequestException:
            # Narrowed from a bare `except:` (which also swallowed
            # KeyboardInterrupt); callers treat "" as "page unavailable".
            return ""
    
    
    def fillCommentsDatas(data, html):
        """Parse one Douban comments page and merge results into *data*.

        *data* is mutated in place, mapping nickname -> [date, star, comment],
        where star is a float on a 0-5 scale or the string "无" when the
        reviewer gave no rating.
        """
        soup = BeautifulSoup(html, 'lxml')
        divs = soup.find_all('div', attrs={'class': 'comment-item'})
        for div in divs:
            if isinstance(div, bs4.element.Tag):
                nickname = div.find('a', attrs={'title': True}).get('title')
                StarandDate = div.find_all('span', attrs={'title': True})
                if len(StarandDate) == 2:
                    # Rating is encoded in the class name, e.g. "allstar40" -> 4.0.
                    # BUGFIX: use a raw string (the old '\d\d' is an invalid
                    # escape sequence warning on modern Python) and drop the
                    # no-op trailing '.*?'.
                    star = float(re.findall(r'allstar(\d\d)', str(StarandDate[0]))[0]) / 10
                    date = StarandDate[1].get('title')
                else:
                    # Only one titled span means the user rated nothing.
                    star = "无"
                    date = StarandDate[0].get('title')
                comment = div.find('span', attrs={'class': 'short'}).string.strip()
                data[nickname] = [date, star, comment]
    
    
    def save_to_pkl(savepath, data):
        """Pickle *data* to the file '影评.pkl' inside directory *savepath*."""
        # Context manager guarantees the handle is closed even if dump raises,
        # unlike the previous open()/close() pair.
        with open(os.path.join(savepath, "影评.pkl"), 'wb') as f:
            pickle.dump(data, f)
    
    
    def write_to_excel(data):
        """Write the comment dict to '影评.xls' in the working directory.

        *data* maps nickname -> [date, star, comment]; one row per reviewer,
        plus a header row, all cells centered and wrapped in 11pt 宋体.
        """
        book = xlwt.Workbook(encoding='ascii')
        sheet = book.add_sheet('BookSheet')
        # Column widths: nickname, star, date, comment.
        for col, width in enumerate((4000, 3000, 8000, 30000)):
            sheet.col(col).width = width
        cell_style = xlwt.XFStyle()
        cell_font = xlwt.Font()
        cell_font.name = "宋体"
        cell_font.height = 11 * 20  # xlwt measures height in 1/20 pt
        cell_align = xlwt.Alignment()
        cell_align.horz = xlwt.Alignment.HORZ_CENTER
        cell_align.vert = xlwt.Alignment.VERT_CENTER
        cell_align.wrap = 1
        cell_style.font = cell_font
        cell_style.alignment = cell_align
        for col, title in enumerate(("昵称", "评分", "日期", "评论")):
            sheet.write(0, col, title, cell_style)
        # Values are stored as [date, star, comment] but the sheet order is
        # nickname / star / date / comment.
        for row, (nick, (date, star, comment)) in enumerate(data.items(), start=1):
            sheet.write(row, 0, nick, cell_style)
            sheet.write(row, 1, star, cell_style)
            sheet.write(row, 2, date, cell_style)
            sheet.write(row, 3, comment, cell_style)
        book.save('影评.xls')
    
    
    if __name__ == '__main__':
        # Entry point: scrape up to 20 pages (20 comments each) of Douban
        # short reviews for one movie, accumulating into `data`.
        data = {}
        mid = input("输入电影的代号:")  # the numeric subject id from the movie's URL
        for i in range(20):
            # `start` paginates 20 comments at a time, sorted by new_score.
            url = "https://movie.douban.com/subject/" + str(mid) + "/comments?start=" + str(i*20) + "&limit=20&sort=new_score&status=P"
            html = get_page(url)
            fillCommentsDatas(data, html)
            # Saving inside the loop checkpoints progress after every page,
            # so a mid-run failure still leaves usable output files.
            save_to_pkl(os.getcwd(), data)
            write_to_excel(data)
            time.sleep(1)  # throttle requests to avoid being blocked
    
    • 生成词云部分
    from wordcloud import WordCloud
    import pickle
    import os
    import jieba
    
    def generateWordCloud(words, savepath):
        """Render *words* (a word -> frequency mapping) as a word-cloud image.

        The picture is saved as 'commentscloud.jpg' inside *savepath*.
        """
        cloud = WordCloud(
            font_path='simkai.ttf',   # CJK-capable font so Chinese text renders
            background_color='white',
            max_words=2000,
            width=1920,
            height=1080,
            margin=5,
        )
        cloud.generate_from_frequencies(words)
        cloud.to_file(os.path.join(savepath, 'commentscloud.jpg'))
    
    
    def frequencies(texts, stopwords):
        """Tokenize each text with jieba and count word occurrences.

        Tokens found in *stopwords* are skipped. Returns a dict mapping
        word -> occurrence count across all texts.
        """
        # A set gives O(1) membership tests; the old code scanned the
        # stopword list once per token.
        stopset = set(stopwords)
        words_dict = {}
        for text in texts:
            for token in jieba.cut(text):
                if token in stopset:
                    continue
                # dict.get replaces the `in words_dict.keys()` anti-idiom and
                # the redundant second lookup.
                words_dict[token] = words_dict.get(token, 0) + 1
        return words_dict
    
    
    if __name__ == '__main__':
        # Load the comment dict checkpointed by the scraper script.
        # (Context managers replace the leaked/manual file handles: the old
        # code never closed stopwords.txt at all.)
        with open('影评.pkl', 'rb') as f:
            data = pickle.load(f)
        # Each value is [date, star, comment]; only the comment text is needed.
        # Iterating .values() replaces indexing into .items() tuples.
        texts = [value[2] for value in data.values()]
        # One stopword per line; [:-1] drops the empty entry produced by the
        # trailing newline.
        with open('stopwords.txt', 'r', encoding='utf-8') as f:
            stopwords = f.read().split('\n')[:-1]
        words_dict = frequencies(texts, stopwords)
        generateWordCloud(words_dict, os.getcwd())
    

    效果

    Excel 词云

    相关文章

      网友评论

          本文标题:Python豆瓣影评爬虫及词云生成

          本文链接:https://www.haomeiwen.com/subject/znelsqtx.html