美文网首页
爬取B站弹幕并生成词云

爬取B站弹幕并生成词云

作者: no_ones | 来源:发表于2018-08-09 17:05 被阅读0次

网上看到的爬取教程接口大都失效了,这次自己整一下,就当学习笔记了

自己在寻找弹幕的时候耗了很长时间,老想在视频上找到弹幕的加载地址……

其实弹幕就在右边


1.png

其实很多实现还是沿用了原有教程里的代码,
代码如下:

from wordcloud import WordCloud
import matplotlib.pyplot as plt
import requests
import jieba
from pyquery import PyQuery as pq
from urllib.parse import urlencode
import datetime


def get_html(url, timeout=10):
    """Fetch ``url`` and return the raw response body.

    Args:
        url: Full request URL, including the query string.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the whole crawl.

    Returns:
        The response body as ``bytes`` when the server answers with HTTP
        200; ``None`` for any other status code or when the request fails.
    """
    headers = {
        'Cookie': 'b LIVE_BUVID__ckMd5=7776ad817b9e0091; bp_t_offset_328350021=150073248314016020; _dfcaptcha=29276d4b1897beac8fcc8bb55f8ecdce',
        'Host': 'api.bilibili.com',
        'Origin': 'https://www.bilibili.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
    }
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        # Only network/HTTP-level failures are treated as "no data"; anything
        # else (e.g. KeyboardInterrupt, programming errors) still propagates.
        print("Connet_Error", exc)
        return None
    if response.status_code != 200:
        return None
    # The raw bytes are returned, so there is no point in guessing a text
    # encoding here: ``response.encoding`` only affects ``response.text``.
    return response.content

def get_text(html):
    """Yield the text of every danmaku element found in ``html``.

    Args:
        html: Markup to parse, or ``None``/empty when the download failed.

    Yields:
        The text content of each ``d`` element nested inside an ``i``
        element. Nothing is yielded when ``html`` is ``None`` or empty.
    """
    # get_html() returns None when a request fails; treat that (and an empty
    # body) as "no danmaku" rather than handing it to the parser.
    if not html:
        return
    doc = pq(html)
    for item in doc('i d').items():
        yield item.text()

def create_date(datestart=None, dateend=None):
    """Build the list of consecutive days from ``datestart`` to ``dateend``.

    Args:
        datestart: First day as a ``'YYYY-MM-DD'`` string; defaults to
            ``'2018-01-01'``.
        dateend: Last day as a ``'YYYY-MM-DD'`` string; defaults to today's
            local date.

    Returns:
        A list of ``'YYYY-MM-DD'`` strings, inclusive on both ends. The
        start day is always present, so when ``datestart`` is later than
        ``dateend`` the list holds only the start day.
    """
    day_format = '%Y-%m-%d'

    first_text = '2018-01-01' if datestart is None else datestart
    if dateend is None:
        last_text = datetime.datetime.now().strftime(day_format)
    else:
        last_text = dateend

    # Parse both ends into datetime objects so they can be subtracted.
    first_day = datetime.datetime.strptime(first_text, day_format)
    last_day = datetime.datetime.strptime(last_text, day_format)

    # Number of days to add after the start day; never negative, because the
    # start day itself is emitted unconditionally.
    extra_days = max((last_day - first_day).days, 0)
    return [
        (first_day + datetime.timedelta(days=offset)).strftime(day_format)
        for offset in range(extra_days + 1)
    ]

def save_to_file(content, filename='1.txt'):
    """Append ``content`` as one line to a UTF-8 text file.

    Args:
        content: Text to store; a trailing newline is added.
        filename: Path of the file to append to. Defaults to ``'1.txt'``.
    """
    # The encoding must be given explicitly so the text is always written as
    # UTF-8. The ``with`` block closes the file; no manual close() is needed.
    with open(filename, 'a', encoding='utf-8') as f:
        f.write(content + '\n')

def wordcloud(all_comments):
    """Segment the comments with jieba and display them as a word cloud.

    Stop words are read from ``stopwords.txt``. The segmented text is saved
    to ``outputs.txt`` and then rendered with ``WordCloud`` and shown in a
    matplotlib window.

    Args:
        all_comments: Iterable of comment strings, one comment per item.
    """
    # Load the stop words once, up front, instead of re-reading the file for
    # every sentence. A set makes each membership test O(1).
    # Always pass encoding='utf-8' when opening these files.
    with open('stopwords.txt', 'r', encoding='utf-8') as stop_file:
        stopwords = {line.strip() for line in stop_file}

    def seg_sentence(sentence):
        """Return the kept words of ``sentence``, each followed by a space."""
        words = jieba.cut(sentence.strip(), cut_all=False)  # accurate mode
        return ''.join(
            word + ' '
            for word in words
            if word not in stopwords and word != '\t'
        )

    segmented = [seg_sentence(line) + '\n' for line in all_comments]

    # Overwrite rather than append, so text left over from a previous run
    # cannot leak into this word cloud.
    with open('outputs.txt', 'w', encoding='utf-8') as f:
        f.writelines(segmented)

    data = ''.join(segmented)
    my_wordcloud = WordCloud(
        background_color='white',  # background colour
        max_words=200,  # maximum number of words to display
        font_path=r'SimHei.ttf',  # a CJK font; without it Chinese text is not rendered
    ).generate(data)
    plt.figure()
    plt.imshow(my_wordcloud)
    plt.axis('off')
    plt.show()  # display the word cloud

def main():
    """Download the danmaku history day by day, then show a word cloud.

    For every day from 2018-08-06 up to today, the danmaku history API is
    queried for the hard-coded ``oid`` and each danmaku is appended to
    ``1.txt`` through ``save_to_file``. The collected lines are then read
    back and passed to ``wordcloud``.
    """
    base_url = "https://api.bilibili.com/x/v2/dm/history?"
    date_list = create_date("2018-08-06")  # start date; generates the list of days
    for day in date_list:
        params = urlencode({
            'type': '1',
            'oid': '23347802',
            'date': day,
        })
        url = base_url + params
        print(url)
        html = get_html(url)
        if html is None:
            # The request failed or returned a non-200 status; skip this day
            # instead of handing None to the parser.
            continue
        for item in get_text(html):
            save_to_file(item)

    # Read back the same file save_to_file() appends to ('1.txt', relative
    # to the working directory) rather than a machine-specific absolute path.
    with open('1.txt', 'r', encoding='utf-8') as f:
        lines = f.readlines()
    wordcloud(lines)
            


# Run the crawl only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

这里先把弹幕内容存进 txt 文件,之后再从文件中读取来生成词云,这样也许会快一些?
结果如下:


Figure_1.png

相关文章

网友评论

      本文标题:爬取B站弹幕并生成词云

      本文链接:https://www.haomeiwen.com/subject/dkscbftx.html