爬取哔哩哔哩视频信息

一、获取视频信息来源

在Chrome浏览器打开bilibili网站，点击动画分类，选择MAD.AMV类，打开Chrome浏览器调试模式，使用全局搜索来搜索一条视频信息，发现视频信息来自于newlist?....这个链接，显然这是动态请求的js链接。

1.png

查看这个js应答的详细内容，发现我们需要的视频信息全部都包含在这个js应答中，我们就可以用python的json模块提取出需要的内容。

2.png

二、编写代码

json内容比较容易分析，不需要正则和xpath来抓取内容。

使用消息队列和多线程来抓取信息，保存为一个json文件，代码如下：

import requests
import json
from queue import Queue
import threading


class BilibiliSpider(object):
    """Fetch video listings from bilibili's ``newlist`` API and save them.

    Pipeline: ``request_url`` downloads each listing page and queues the raw
    JSON text, ``parse_josn`` workers turn every archive entry into a flat
    dict, and a ``save_content`` worker appends each dict as one JSON line to
    ``video_info.json``.
    """

    # Seconds to wait on a single HTTP request, so an unresponsive server
    # cannot stall the whole crawl indefinitely.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        """Build the list of page URLs and the two work queues."""
        self.base_url = 'https://api.bilibili.com/x/web-interface/newlist?&rid=24&type=0&pn={}&ps=20'
        # Pages 1 through 19 of the listing.
        self.url_list = [self.base_url.format(i) for i in range(1, 20)]
        # Holds the decoded response body of every requested page.
        self.url_queue = Queue()
        # Holds the extracted per-video info dicts waiting to be written.
        self.content_queue = Queue()

    def request_url(self):
        """Download every URL in ``url_list`` and queue the decoded body.

        Raises:
            requests.RequestException: if a request fails or times out.
        """
        for url in self.url_list:
            response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
            self.url_queue.put(response.content.decode())

    @staticmethod
    def _build_video_info(temp):
        """Return the flat info dict for one archive entry of the response."""
        stat = temp['stat']
        owner = temp['owner']
        return {
            # Video title
            'title': temp['title'],
            # Video page URL
            'video_url': 'https://www.bilibili.com/video/av' + str(temp['aid']),
            # Cover image URL
            'video_cover_url': temp['pic'],
            # Category name
            'video_type': temp['tname'],
            # Category id
            'video_type_id': temp['tid'],
            # Video description
            'video_desc': temp['desc'],
            # Number of danmaku (bullet comments)
            'danmu_count': stat['danmaku'],
            # Number of favorites
            'favorite': stat['favorite'],
            # Uploader name
            'video_owner': owner['name'],
            # Uploader id
            'video_owner_id': owner['mid'],
            # Uploader avatar URL
            'video_face_cover_url': owner['face'],
        }

    def parse_josn(self):
        """Worker loop: parse queued response bodies into video info dicts.

        Never returns; ``run`` starts it on daemon threads. A payload that
        cannot be parsed is reported and skipped so the worker stays alive.
        """
        while True:
            json_str = self.url_queue.get()
            try:
                # No ``encoding`` argument: json.loads dropped it in
                # Python 3.9 and the input is already a str.
                archives = json.loads(json_str)['data']['archives']
                for temp in archives:
                    self.content_queue.put(self._build_video_info(temp))
            except (ValueError, KeyError, TypeError) as err:
                print('解析失败: {}'.format(err))
            finally:
                # Always balance the get(); otherwise url_queue.join() in
                # run() would block forever after a failure.
                self.url_queue.task_done()

    def save_content(self):
        """Worker loop: append each queued info dict to ``video_info.json``.

        Never returns; ``run`` starts it on a daemon thread. Each dict is
        written as one JSON document per line.
        """
        while True:
            content = self.content_queue.get()
            try:
                with open('video_info.json', 'a', encoding='utf8') as f:
                    f.write(json.dumps(content, ensure_ascii=False))
                    f.write('\n')
                print('写入成功')
            except OSError as err:
                print('写入失败: {}'.format(err))
            finally:
                # Always balance the get() so content_queue.join() can finish.
                self.content_queue.task_done()

    def run(self):
        """Download all pages, then parse and save them with worker threads."""
        # Issue the requests; this fills url_queue before any worker starts.
        self.request_url()

        # Ten threads parse the JSON responses. Daemon threads end together
        # with the main thread, which is what lets the endless loops stop.
        threading_list = [
            threading.Thread(target=self.parse_josn, daemon=True)
            for _ in range(10)
        ]
        # One thread writes the results to disk.
        threading_list.append(
            threading.Thread(target=self.save_content, daemon=True)
        )

        for thread in threading_list:
            thread.start()

        # Block the main thread until both queues are fully processed.
        # url_queue is joined first so every parsed item is already in
        # content_queue before that queue is joined.
        for pending in (self.url_queue, self.content_queue):
            pending.join()


if __name__ == '__main__':
    # Script entry point: build the spider and run the full crawl.
    spider = BilibiliSpider()
    spider.run()