美文网首页简友广场想法心理
python bilibili热度榜并存储到MongoDB数据库

python bilibili热度榜并存储到MongoDB数据库

作者: Cache_wood | 来源:发表于2022-02-27 10:48 被阅读0次

    获取数据

    使用助教提供的代码获取B站五个小分区的数据

    存入数据库

    使用批量插入的insert_many方法,将json型数据批量插入MongoDB数据库

    对比更新数据库

    按照要求的方法对比并更新数据库,使用insert_one、update_one等函数。

    代码部分

    """
    一个简单的🌰,主要关于协程网络爬虫与邮件发送
    务必使用py3.6及以上版本运行此文件
    
    代码包含的内容:
    - 协程爬取bilibili生活区5个小分区的热榜
    - 爬取完成后发送邮件通知
    
    请勿在作业中直接使用本代码,因为其存在以下已知问题:
    - 不保证对热榜爬取的顺序
    - 对于视频描述和标题中存在\n的情况没有处理,这将导致结果数据文件无法直接使用
    - 邮件为私人邮箱,且有发信限制(每日上限450封,每秒上限200封),邮箱会在2022-01-01关闭SMTP发信许可,请及时更换为自己的邮箱。
    - 其他未知问题
    """
    
    """
    名称  代号  tid 简介  url路由
    生活(主分区) life    160     /v/life
    搞笑  funny   138 各种沙雕有趣的搞笑剪辑,挑战,表演,配音等视频 /v/life/funny
    家居房产    home    239 与买房、装修、居家生活相关的分享    /v/life/home
    手工  handmake    161 手工制品的制作过程或成品展示、教程、测评类视频 /v/life/handmake
    绘画  painting    162 绘画过程或绘画教程,以及绘画相关的所有视频   /v/life/painting
    日常  daily   21  记录日常生活,分享生活故事   /v/life/daily
    """
    
    import asyncio
    import glob
    import json
    import os
    import random
    import sys
    import time
    from datetime import datetime
    from email.header import Header
    from email.mime.text import MIMEText
    from smtplib import SMTP_SSL

    import aiohttp
    import requests as rs
    from pymongo import MongoClient
    
    # Directory the per-category CSV files are written to.
    SAVE_PATH = './'
    # Search endpoint queried for the hot-rank listing.
    BASE_URL = 'https://s.search.bilibili.com/cate/search'
    # Category ids (tid) to crawl: funny, home, handmake, painting, daily.
    CATE_LIST = [138,239,161,162,21]
    # Candidate delays (seconds) picked at random before each request.
    sleep_choice = [1,2,3,4,5]

    # Email configuration.
    # Every value can be overridden through an environment variable so that
    # credentials do not have to be edited into (or committed with) the source;
    # the literals below are only the fallback defaults.
    SMTP_SERVER = os.environ.get('BILI_SMTP_SERVER', 'smtp.feishu.cn')
    PORT = int(os.environ.get('BILI_SMTP_PORT', 465))
    FROM_ADDR = os.environ.get('BILI_FROM_ADDR', 'send.test@eightlab.club')
    PASSWD = os.environ.get('BILI_SMTP_PASSWD', 'mUuscUgDNHvOtuQg')
    TO_ADDR = os.environ.get('BILI_TO_ADDR', 'receive.test@eightlab.club')
    
    class BiliHotCrawler():
        """Crawl the hot-rank listing of one bilibili category into a CSV file.

        Pages are requested one at a time from BASE_URL and each result row is
        appended to ``<SAVE_PATH>/<cate_id>.csv``.

        NOTE: fields are joined with plain commas and are not CSV-escaped, so a
        comma inside a title or description shifts the following columns.
        """

        # Item keys written after the category id, in column order:
        # rank, bvid, duration, plays, danmaku count, title, cover,
        # comments, favorites, description, direct link.
        _ROW_FIELDS = (
            'rank_offset', 'bvid', 'duration', 'play', 'video_review', 'title',
            'pic', 'review', 'favorites', 'description', 'arcurl',
        )

        def __init__(self, cate, limit=300, time_from=20211110, time_to=20211117):
            """Set up the crawler.

            Args:
                cate: category id sent as ``cate_id``.
                limit: crawling stops once ``pagesize * page`` exceeds this.
                time_from: start of the time window, sent as ``time_from``.
                time_to: end of the time window, sent as ``time_to``.
            """
            self.cate_id = cate
            self.limit = limit
            self.time_from = time_from
            self.time_to = time_to
            self.page = 1
            self.pagesize = 100
            self.bulid_param()
            self.url = BASE_URL

        def bulid_param(self):
            """Rebuild ``self.params`` (the query string) for the current page."""
            self.params = {
                'main_ver':'v3',
                'search_type':'video',
                'view_type':'hot_rank',
                'order':'click',
                'cate_id':self.cate_id,
                'page':self.page,
                'pagesize':self.pagesize,
                'time_from':self.time_from,
                'time_to':self.time_to
            }

        async def get_resp(self):
            """Fetch the current page into ``self.resp`` (raw bytes) and advance the page counter."""
            # asyncio.sleep yields to the event loop; time.sleep here would
            # freeze every other crawler for the duration of the delay.
            await asyncio.sleep(random.choice(sleep_choice))
            self.bulid_param()
            async with aiohttp.ClientSession() as session:
                async with session.get(url=self.url,params=self.params) as response:
                    self.resp = await response.read()
            self.page += 1

        def save_resp(self):
            """Parse ``self.resp`` as JSON and append one line per result item to the CSV file.

            ``self.resp`` is replaced by the parsed object. A response whose
            ``result`` is missing or null writes nothing.
            """
            path = os.path.join(SAVE_PATH, f"{self.cate_id}.csv")
            # json.loads instead of eval: the payload comes from the network, and
            # textual null/true/false replacement also rewrote those words
            # inside titles and descriptions.
            self.resp = json.loads(self.resp.decode('utf-8'))
            items = self.resp.get("result") or []
            with open(path,"a",encoding='utf-8') as f:
                for item in items:
                    columns = [str(self.cate_id)]
                    columns.extend(str(item[field]) for field in self._ROW_FIELDS)
                    # Keep each record on one physical line.
                    row = ','.join(columns).replace("\n"," ")
                    # Characters that cannot be encoded as UTF-8 (e.g. lone
                    # surrogates) are replaced with '?' rather than failing the write.
                    row = row.encode('utf-8', 'replace').decode('utf-8')
                    f.write(row)
                    f.write("\n")

        def log(self,isend=False):
            """Print the progress on the current console line; end the line when ``isend`` is true."""
            print(f"\rCrawlering {self.cate_id} {(self.page - 1)*self.pagesize}/{self.limit}",end="")
            if isend:
                print()

        async def start(self):
            """Fetch and save pages until ``pagesize * page`` exceeds ``limit``."""
            while self.pagesize * self.page <= self.limit:
                self.log()
                await self.get_resp()
                self.save_resp()
            self.log(isend=True)
    
    
    class BHCFactory():
        """Run one BiliHotCrawler per category id concurrently on a single event loop."""

        def __init__(self,cate_list):
            """Remember the category ids to crawl."""
            self.cate_list = cate_list

        async def _crawl_all(self):
            """Await every category crawler; return results or exceptions in input order."""
            crawls = [BiliHotCrawler(cate).start() for cate in self.cate_list]
            # gather accepts bare coroutines; asyncio.wait stopped accepting
            # them in Python 3.11. return_exceptions keeps one failing category
            # from cancelling the others.
            return await asyncio.gather(*crawls, return_exceptions=True)

        def produce(self): # effectively starts one coroutine per category to crawl its hot rank
            """Crawl all categories and block until every crawler has finished.

            Failures of individual crawlers are reported on stderr instead of
            being raised.
            """
            # A private loop avoids the deprecated get_event_loop() call made
            # without a running loop.
            loop = asyncio.new_event_loop()
            try:
                outcomes = loop.run_until_complete(self._crawl_all())
            finally:
                loop.close()
            for cate, outcome in zip(self.cate_list, outcomes):
                if isinstance(outcome, Exception):
                    print(f"Crawler for category {cate} failed: {outcome!r}", file=sys.stderr)
    
    
    class BiliNoticeMail:
        """Singleton that sends a "download finished" notification mail over SMTP/SSL."""

        def __new__(cls, *args, **kwargs):
            """Return the single shared instance, creating it on first use."""
            # Extra arguments are accepted but not forwarded: object.__new__
            # raises TypeError when it receives any.
            if not hasattr(BiliNoticeMail, '_instance'):
                BiliNoticeMail._instance = super().__new__(cls)
            return BiliNoticeMail._instance

        def __init__(self):
            """Open a connection to SMTP_SERVER:PORT and clear the pending message.

            Runs on every ``BiliNoticeMail()`` call, so the shared instance gets
            a fresh connection each time.
            """
            self.server = SMTP_SSL(SMTP_SERVER, PORT)
            self.message = None

        def build_ready_message(self):
            """Prepare the plain-text "download finished" mail in ``self.message``."""
            self.message = MIMEText('下载完成', 'plain', 'utf-8')
            self.message['From'] = Header(FROM_ADDR, 'utf-8')
            self.message['To'] = Header(TO_ADDR, 'utf-8')
            self.message['Subject'] = Header('BiliReminder:Ready', 'utf-8')

        def notice(self):
            """Log in, send the pending message and close the connection.

            Raises:
                ValueError: if no message has been built.
            """
            if self.message is None:
                raise ValueError('Message is none!')
            try:
                self.server.login(FROM_ADDR, PASSWD)
                self.server.sendmail(FROM_ADDR, [TO_ADDR], self.message.as_string())
                self.message = None
            finally:
                # Close the connection even when login or sending fails.
                self.server.quit()

        @classmethod
        def send_ready_mail(cls):
            """Build and send the ready mail; return the instance that sent it."""
            noticer = cls()
            noticer.build_ready_message()
            noticer.notice()
            print("发信成功,请查收。")
            return noticer
    
    def Defaultdatabase():  # baseline load: the data inserted into the database a week earlier
        """Load every ``*.csv`` in the working directory and bulk-insert the rows into MongoDB.

        Each line is split on commas and zipped with the column names; a
        ``time`` field holding the load timestamp is added. Rows go to the
        ``collections`` collection of the ``bilibili`` database on
        localhost:27017 with a single ``insert_many`` call.
        """
        key = ['cate_id','rank_offset','bvid','duration','play','video_review','title','pic','review','favorites','description','arcurl']
        texts = []
        for lis in glob.glob('*.csv'):
            with open(lis,encoding='utf-8') as f:
                for line in f:
                    stripped = line.strip()
                    if not stripped:
                        continue  # ignore blank lines
                    dic = dict(zip(key, stripped.split(',')))
                    # %m/%d are month/day; the previous "%M"/"%D" produced
                    # minutes and an mm/dd/yy date inside the timestamp.
                    dic['time'] = datetime.now().strftime("%Y-%m-%d,%H:%M:%S")
                    texts.append(dic)

        print('load %d lines' % len(texts))
        if not texts:
            # insert_many rejects an empty document list.
            return

        # The context manager closes the client even if the insert fails.
        with MongoClient('localhost',27017) as client:
            collection = client.bilibili.collections
            result = collection.insert_many(texts)  # insert everything in one call
            print(result)
    
    def insert():  # a week later: compare the local data with the database and update it
        """Insert or update one MongoDB document per row of the local ``*.csv`` files.

        Rows are matched on ``bvid``: a row whose ``bvid`` is not yet in
        ``bilibili.collections`` is inserted, otherwise the first matching
        document has its fields overwritten with the row's values. Lines too
        short to contain a ``bvid`` column are skipped.
        """
        key = ['cate_id','rank_offset','bvid','duration','play','video_review','title','pic','review','favorites','description','arcurl']
        # One client for the whole run (closed on exit) instead of opening new
        # connections for every row.
        with MongoClient('localhost',27017) as client:
            mycol = client.bilibili.collections
            for lis in glob.glob('*.csv'):  # open each local file and look every row up
                with open(lis,encoding='utf-8') as f:
                    for line in f:
                        dic = dict(zip(key, line.strip().split(',')))
                        if 'bvid' not in dic:
                            continue  # malformed or blank line
                        # %m/%d are month/day; the previous "%M"/"%D" produced
                        # minutes and an mm/dd/yy date inside the timestamp.
                        dic['time'] = datetime.now().strftime("%Y-%m-%d,%H:%M:%S")

                        selector = {'bvid': dic['bvid']}
                        if mycol.find_one(selector) is None:
                            mycol.insert_one(dic)
                        else:
                            mycol.update_one(selector, {'$set': dic})
    
    def find(exam_dic):  # check whether a document matching the condition exists
        """Return the first document in ``bilibili.collections`` matching ``exam_dic``.

        Args:
            exam_dic: MongoDB filter document passed to ``find_one``.

        Returns:
            Whatever ``find_one`` returns for the filter; callers compare the
            result with ``None`` to detect "no match".
        """
        with MongoClient('localhost',27017) as client:
            # Only the filtered query is issued; the former unfiltered
            # find_one() result was discarded and cost an extra round trip.
            return client.bilibili.collections.find_one(exam_dic)
    
    def main():
        """Run the pipeline: crawl, send the notification mail, bulk-load, then compare/update."""
        crawler_factory = BHCFactory(CATE_LIST)
        crawler_factory.produce()

        BiliNoticeMail.send_ready_mail()

        Defaultdatabase()
        insert()
    
    # Run the pipeline only when this file is executed as a script, not when it is imported.
    if __name__ == '__main__':
        main()
    

    示例数据

    {'_id': ObjectId('61d7ae21e5ee63bf735963ba'), 'cate_id': '138', 'rank_offset': '25', 'bvid': 'BV1CU4y1M7TY', 'duration': '67', 'play': '3609867', 'video_review': '3607', 'title': '吱  辅  导', 'pic': '\\/\\/i0.hdslb.com\\/bfs\\/archive\\/2486e99de186567d1ccb720bcec784d4f991685e.jpg', 'review': '3449', 'favorites': '25365', 'description': '这可能是自己都看不下去第二遍的玩意
    儿吧……', 'arcurl': 'http:\\/\\/www.bilibili.com\\/video\\/av676718837', 'time': '2022-28-01/07/22,11:28:54'}
    {'_id': ObjectId('61d7ae21e5ee63bf735963bb'), 'cate_id': '138', 'rank_offset': '26', 'bvid': 'BV1WL4y1v7TX', 'duration': '102', 'play': '3531214', 'video_review': '1243', 'title': '双 十 一 购 物 图 鉴', 'pic': '\\/\\/i0.hdslb.com\\/bfs\\/archive\\/a9d9d25495eece9909222e3436bb92141984ac93.jpg', 'review': '9890', 'favorites': '13417', 'description': '今年双11你买什么啦?', 
    'arcurl': 'http:\\/\\/www.bilibili.com\\/video\\/av849099902', 'time': '2022-28-01/07/22,11:28:54'}
    {'_id': ObjectId('61d7ae21e5ee63bf735963bc'), 'cate_id': '138', 'rank_offset': '27', 'bvid': 'BV1d44y1e7U3', 'duration': '160', 'play': '3484794', 'video_review': '2388', 'title': '当你考试遇到不会做的题时', 'pic': '\\/\\/i0.hdslb.com\\/bfs\\/archive\\/0ea631ceb2ebb84ffd9695a2673f33413d83934b.jpg', 'review': '1634', 'favorites': '37372', 'description': '我破房了,你呢', 'arcurl': 'http:\\/\\/www.bilibili.com\\/video\\/av976669354', 'time': '2022-28-01/07/22,11:28:54'}
    {'_id': ObjectId('61d7ae21e5ee63bf735963bd'), 'cate_id': '138', 'rank_offset': '28', 'bvid': 'BV18b4y1t7BR', 'duration': '178', 'play': '3438752', 'video_review': '3784', 'title': '主 播 直 播 pk 有 多 拼', 'pic': '\\/\\/i0.hdslb.com\\/bfs\\/archive\\/22c32c4def623396d8ca210da94946faa048c812.jpg', 'review': '3373', 'favorites': '47719', 'description': '微博@你别说还真是这 
    样', 'arcurl': 'http:\\/\\/www.bilibili.com\\/video\\/av634189272', 'time': '2022-28-01/07/22,11:28:54'}
    {'_id': ObjectId('61d7ae21e5ee63bf735963be'), 'cate_id': '138', 'rank_offset': '29', 'bvid': 'BV1QQ4y1m7Lc', 'duration': '94', 'play': '3413101', 'video_review': '2816', 'title': '小伙伴们你的人生你来主宰 不要听别人劝!绝对不行太痛苦了!我还在继续哭 
    !', 'pic': '\\/\\/i0.hdslb.com\\/bfs\\/archive\\/24c557dd8cf60d39e9b2e6d35940d5a21331e168.jpg', 'review': '10387', 'favorites': '23799', 'description': '-', 'arcurl': 'http:\\/\\/www.bilibili.com\\/video\\/av719156420', 'time': '2022-28-01/07/22,11:28:54'}
    {'_id': ObjectId('61d7ae21e5ee63bf735963bf'), 'cate_id': '138', 'rank_offset': '30', 'bvid': 'BV1e34y1Z7pU', 'duration': '160', 'play': '3358074', 'video_review': '1776', 'title': '有没有人装逼正好撞到你擅长的领域上?', 'pic': '\\/\\/i0.hdslb.com\\/bfs\\/archive\\/91524e9e19d5586889b97bac42ff93eac7215c69.jpg', 'review': '10388', 'favorites': '11954', 'description': '-', 'arcurl': 'http:\\/\\/www.bilibili.com\\/video\\/av806587972', 'time': '2022-28-01/07/22,11:28:54'}
    {'_id': ObjectId('61d7ae21e5ee63bf735963c0'), 'cate_id': '138', 'rank_offset': '31', 'bvid': 'BV1jb4y1t7Xk', 'duration': '243', 'play': '3307671', 'video_review': '134', 'title': '满级生物行为大赏!红眼之“社交天花板”,这气势真的很嚣张', 'pic': '\\/\\/i0.hdslb.com\\/bfs\\/archive\\/e2caac4a2055c530f982f3edb58add0e06f8a38e.jpg', 'review': '39', 'favorites': '13552', 'description': '-', 'arcurl': 'http:\\/\\/www.bilibili.com\\/video\\/av634150292', 'time': '2022-28-01/07/22,11:28:54'}
    {'_id': ObjectId('61d7ae21e5ee63bf735963c1'), 'cate_id': '138', 'rank_offset': '32', 'bvid': 'BV1n44y1e7wr', 'duration': '98', 'play': '3306048', 'video_review': '5528', 'title': '南极贱畜是什么梗【梗指南】', 'pic': '\\/\\/i0.hdslb.com\\/bfs\\/archive\\/d235ef7a1194305bb2241b7924592f62cedfaa5b.jpg', 'review': '9735', 'favorites': '38119', 'description': '南极贱畜是什么梗  
    *本视频仅作为了解该梗的辅助,不能仅靠该视频作为对该梗的全面了解,真正深入了解可以观看原出处以及更多方面~', 'arcurl': 'http:\\/\\/www.bilibili.com\\/video\\/av976548175', 'time': '2022-28-01/07/22,11:28:54'}
    

    相关文章

      网友评论

        本文标题:python bilibili热度榜并存储到MongoDB数据库

        本文链接:https://www.haomeiwen.com/subject/bhjnlrtx.html