爬取B站弹幕并生成词云

作者: no_ones | 来源:发表于2018-08-09 17:05 被阅读0次

爬取B站弹幕并生成词云
网络爬虫爬取b站励志弹幕并生成词云(精心笔记总结)！
b站弹幕爬取
萌新学习Python爬取B站弹幕+R语言分词demo说明
Python爬虫实战(6)-爬取QQ空间好友说说并生成词云(超详
用Python做一个漂亮小姐姐词云跳舞视频
python爬取B站up主全部视频封面
python3——爬取B站弹幕
原来爬取b站弹幕这么简单!
堪称史上最强的代码采集B站全部视频！python 帮你

网上看到的爬取教程接口大都失效了，这次自己整一下，就当学习笔记了

自己在寻找弹幕的时候耗了很长时间，老想在视频上找到弹幕的加载地址……

其实弹幕就在右边

1.png

其实好多实现还是沿用了原来那些教程里的写法
代码如下：

from wordcloud import WordCloud
import matplotlib.pyplot as plt
import requests
import jieba
from pyquery import PyQuery as pq
from urllib.parse import urlencode
import datetime


def get_html(url):
    """Download *url* and return the raw response body.

    Args:
        url: Fully built request URL.

    Returns:
        The response body as ``bytes`` when the server answers with
        status 200; ``None`` when the status is anything else or the
        request itself fails (connection error, timeout, bad URL, ...).
    """
    # NOTE(review): the Cookie value is a hard-coded browser session string;
    # consider loading it from configuration instead of source control.
    headers = {
        'Cookie': 'b LIVE_BUVID__ckMd5=7776ad817b9e0091; bp_t_offset_328350021=150073248314016020; _dfcaptcha=29276d4b1897beac8fcc8bb55f8ecdce',
        'Host': 'api.bilibili.com',
        'Origin': 'https://www.bilibili.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
    }
    try:
        # A timeout keeps a stalled connection from blocking the whole run.
        response = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException:
        # Only network-level failures are handled here, so Ctrl-C and
        # programming errors are no longer silently swallowed.
        print("Connet_Error")
        return None

    if response.status_code != 200:
        return None
    # The body is returned as bytes, so no text-encoding detection is needed.
    return response.content

def get_text(html):
    """Yield the text of every ``<d>`` element found inside an ``<i>`` element.

    Args:
        html: Markup to parse, e.g. the value returned by ``get_html``.
            ``None`` or an empty string/bytes object yields nothing.

    Yields:
        The text content of each matched element, in document order.
    """
    # get_html() returns None when a download fails; there is nothing to
    # parse in that case, so end the generator instead of handing it to pq().
    if html is None or (isinstance(html, (bytes, str)) and not html):
        return

    doc = pq(html)
    for node in doc('i d').items():
        yield node.text()

def create_date(datestart=None, dateend=None):
    """Build the list of consecutive days from *datestart* to *dateend*.

    Args:
        datestart: First day as a ``'YYYY-MM-DD'`` string. Defaults to
            ``'2018-01-01'``.
        dateend: Last day (inclusive) as a ``'YYYY-MM-DD'`` string.
            Defaults to today's date.

    Returns:
        A list of ``'YYYY-MM-DD'`` strings, one per day, both ends
        included. The list is empty when *datestart* is later than
        *dateend*.

    Raises:
        ValueError: If either string does not match ``'%Y-%m-%d'``.
    """
    fmt = '%Y-%m-%d'
    if datestart is None:
        datestart = '2018-01-01'
    if dateend is None:
        dateend = datetime.datetime.now().strftime(fmt)

    # Parse both ends so malformed input fails early with ValueError.
    first = datetime.datetime.strptime(datestart, fmt)
    last = datetime.datetime.strptime(dateend, fmt)

    # Both values sit at midnight, so the difference is a whole number of
    # days; a negative span gives an empty range and therefore an empty list.
    span = (last - first).days
    return [
        (first + datetime.timedelta(days=offset)).strftime(fmt)
        for offset in range(span + 1)
    ]

def save_to_file(content):
    """Append *content* followed by a newline to ``1.txt``.

    The file is opened in append mode with UTF-8 encoding, so earlier
    lines are kept and non-ASCII text is stored correctly.
    """
    with open('1.txt', mode='a', encoding='utf-8') as out:
        # The context manager closes the file; no explicit close() needed.
        out.write(content + '\n')

def wordcloud(all_comments):
    """Segment the comments, drop stop words, and display a word cloud.

    Each comment is tokenised with jieba, filtered against
    ``stopwords.txt``, and appended as one line to ``outputs.txt``. The
    full contents of ``outputs.txt`` are then read back and rendered with
    ``WordCloud`` in a matplotlib window.

    Args:
        all_comments: Iterable of comment strings (one comment per item).
    """
    # Load the stop words once, into a set, instead of re-reading the file
    # for every comment. Files must be opened with encoding='utf-8'.
    with open('stopwords.txt', 'r', encoding='utf-8') as stop_file:
        stopwords = {line.strip() for line in stop_file}

    def seg_sentence(sentence):
        """Return the kept tokens of *sentence*, each followed by a space."""
        tokens = jieba.cut(sentence.strip(), cut_all=False)  # accurate mode
        return ''.join(
            token + ' '
            for token in tokens
            if token not in stopwords and token != '\t'
        )

    # Append mode is kept on purpose: outputs.txt accumulates segmented
    # text, and everything in it feeds the cloud below.
    with open('outputs.txt', 'a', encoding='utf-8') as out_file:
        for line in all_comments:
            out_file.write(seg_sentence(line) + '\n')

    with open('outputs.txt', 'r', encoding='utf-8') as in_file:
        data = in_file.read()

    my_wordcloud = WordCloud(
        background_color='white',  # background colour
        max_words=200,  # maximum number of words shown
        font_path=r'SimHei.ttf',  # a font is required to render Chinese text
    ).generate(data)
    plt.figure()
    plt.imshow(my_wordcloud)
    plt.axis('off')
    plt.show()  # display the word cloud

def main():
    """Download danmaku day by day, store them in ``1.txt``, then plot them.

    For every day from 2018-08-06 up to today, the history endpoint is
    queried for the hard-coded ``oid``; each extracted comment is appended
    to ``1.txt`` via ``save_to_file``. Afterwards the file is read back
    and passed to ``wordcloud``.
    """
    base_url = "https://api.bilibili.com/x/v2/dm/history?"
    # Start date of the scrape; the end date defaults to today.
    for day in create_date("2018-08-06"):
        query = urlencode({
            'type': '1',
            'oid': '23347802',
            'date': day,
        })
        url = base_url + query
        print(url)

        html = get_html(url)
        if html is None:
            # Download failed or returned a non-200 status: skip this day.
            continue
        for item in get_text(html):
            save_to_file(item)

    # Read back the same file save_to_file() writes ('1.txt' in the working
    # directory) rather than a machine-specific absolute path.
    with open('1.txt', 'r', encoding='utf-8') as f:
        lines = f.readlines()
    wordcloud(lines)
            


# Run the scrape-and-plot pipeline only when this file is executed as a script.
if __name__ == "__main__":
    main()

这里先把弹幕内容存进 txt 文件，之后再从文件里读出来生成词云，这样会不会快一些？
结果如下：

Figure_1.png