获取数据
使用助教提供的代码获取B站五个小分区的数据
存入数据库
使用批量插入的 insert_many 方法,将 JSON 型数据批量插入 MongoDB 数据库
对比更新数据库
按照要求的方法对比并更新数据库,使用 insert_one 和 update_one 等函数。
代码部分
"""
一个简单的🌰,主要关于协程网络爬虫与邮件发送
务必使用py3.6及以上版本运行此文件
代码包含的内容:
- 协程爬取bilibili生活区5个小分区的热榜
- 爬取完成后发送邮件通知
请勿在作业中直接使用本代码,因为其存在以下已知问题:
- 不保证对热榜爬取的顺序
- 对于视频描述和标题中存在\n的情况没有处理,这将导致结果数据文件无法直接使用
- 邮件为私人邮箱,且有发信限制(每日上限450封,每秒上限200封),邮箱会在2022-01-01关闭SMTP发信许可,请及时更换为自己的邮箱。
- 其他未知问题
"""
"""
名称 代号 tid 简介 url路由
生活(主分区) life 160 /v/life
搞笑 funny 138 各种沙雕有趣的搞笑剪辑,挑战,表演,配音等视频 /v/life/funny
家居房产 home 239 与买房、装修、居家生活相关的分享 /v/life/home
手工 handmake 161 手工制品的制作过程或成品展示、教程、测评类视频 /v/life/handmake
绘画 painting 162 绘画过程或绘画教程,以及绘画相关的所有视频 /v/life/painting
日常 daily 21 记录日常生活,分享生活故事 /v/life/daily
"""
import asyncio
import glob
import json
import os
import random
import sys
import time
from datetime import datetime
from email.header import Header
from email.mime.text import MIMEText
from smtplib import SMTP_SSL

import aiohttp
import requests as rs
from pymongo import MongoClient
# Directory prefix used when building the per-category CSV file paths.
SAVE_PATH = './'
# Endpoint queried for the category hot-rank listing.
BASE_URL = 'https://s.search.bilibili.com/cate/search'
# Category ids (tid) to crawl; per the table in the module docstring these are
# funny (138), home (239), handmake (161), painting (162) and daily (21).
CATE_LIST = [138,239,161,162,21]
# Candidate delays, in seconds, that a crawler randomly picks from before each request.
sleep_choice = [1,2,3,4,5]
# Email configuration
SMTP_SERVER = 'smtp.feishu.cn'
PORT = 465
FROM_ADDR = 'send.test@eightlab.club'
# NOTE(review): credential is hard-coded in source; consider loading it from
# the environment or a secrets store instead.
PASSWD = 'mUuscUgDNHvOtuQg'
TO_ADDR = 'receive.test@eightlab.club'
class BiliHotCrawler():
    """Crawl the hot-rank listing of one bilibili category, page by page.

    Every fetched page is appended to ``{SAVE_PATH}/{cate_id}.csv`` with one
    comma-joined row per video, in the column order: category, rank, bvid,
    duration, plays, danmaku count, title, cover, comments, favorites,
    description, url.

    Known limitation: fields are joined with bare commas and are not quoted,
    so a title or description that itself contains a comma yields extra
    columns in the output row.

    Args:
        cate: Category id (tid) sent as ``cate_id``.
        limit: Maximum number of entries to fetch; pages are requested while
            ``pagesize * page <= limit``.
        time_from: Start of the ranking window as a ``YYYYMMDD`` integer.
        time_to: End of the ranking window as a ``YYYYMMDD`` integer.
    """

    # Item keys written after the category id, in CSV column order.
    _ROW_FIELDS = (
        "rank_offset", "bvid", "duration", "play", "video_review", "title",
        "pic", "review", "favorites", "description", "arcurl",
    )

    def __init__(self, cate, limit=300, time_from=20211110, time_to=20211117):
        self.cate_id = cate
        self.limit = limit
        self.time_from = time_from
        self.time_to = time_to
        self.page = 1
        self.pagesize = 100
        self.bulid_param()
        self.url = BASE_URL

    def bulid_param(self):
        """Rebuild ``self.params`` (the query string) for the current page."""
        self.params = {
            'main_ver': 'v3',
            'search_type': 'video',
            'view_type': 'hot_rank',
            'order': 'click',
            'cate_id': self.cate_id,
            'page': self.page,
            'pagesize': self.pagesize,
            'time_from': self.time_from,
            'time_to': self.time_to,
        }

    async def get_resp(self):
        """Fetch the current page into ``self.resp`` (raw bytes) and advance the page counter."""
        # asyncio.sleep yields to the event loop; a blocking time.sleep here
        # would stall every other crawler sharing the loop.
        await asyncio.sleep(random.choice(sleep_choice))
        self.bulid_param()
        async with aiohttp.ClientSession() as session:
            async with session.get(url=self.url, params=self.params) as response:
                self.resp = await response.read()
        self.page += 1

    @staticmethod
    def _parse_payload(raw):
        """Decode a raw JSON response body (bytes or str) into Python objects."""
        if isinstance(raw, (bytes, bytearray)):
            raw = raw.decode('utf-8')
        # json.loads instead of eval: the body comes from the network, and
        # textual null/true/false replacement corrupts titles containing
        # those words.
        return json.loads(raw)

    @staticmethod
    def _format_row(cate_id, item):
        """Build one CSV row (without trailing newline) for a result item.

        Raises:
            KeyError: If *item* lacks one of the expected fields.
        """
        values = [cate_id]
        values.extend(item[field] for field in BiliHotCrawler._ROW_FIELDS)
        row = ",".join(str(value) for value in values)
        # Line breaks inside a field would split the record when the file is
        # read back line by line, so both \r and \n are flattened to spaces.
        row = row.replace("\r", " ").replace("\n", " ")
        # Replace anything that cannot be encoded (e.g. lone surrogates).
        return row.encode('utf-8', 'replace').decode('utf-8')

    def save_resp(self):
        """Parse ``self.resp`` and append its result items to the category CSV file.

        After the call ``self.resp`` holds the parsed response instead of the
        raw bytes. A missing or null ``result`` entry writes nothing.
        """
        path = f"{SAVE_PATH}/{self.cate_id}.csv"
        self.resp = self._parse_payload(self.resp)
        items = self.resp.get("result") or []
        with open(path, "a", encoding='utf-8') as f:
            for item in items:
                f.write(self._format_row(self.cate_id, item))
                f.write("\n")

    def log(self, isend=False):
        """Print single-line progress; pass ``isend=True`` to finish the line."""
        print(f"\rCrawlering {self.cate_id} {(self.page - 1)*self.pagesize}/{self.limit}",end="")
        if isend:
            print()

    async def start(self):
        """Fetch and save pages until the next page would exceed ``limit``."""
        while self.pagesize * self.page <= self.limit:
            self.log()
            await self.get_resp()
            self.save_resp()
        self.log(isend=True)
class BHCFactory():
    """Build one BiliHotCrawler per category id and run them concurrently.

    Args:
        cate_list: Iterable of category ids to crawl.
    """

    def __init__(self, cate_list):
        self.cate_list = cate_list

    def produce(self):
        """Run one crawler coroutine per category and block until all finish.

        A crawler that raises does not stop the others; its exception is
        printed once every crawler has finished. Returns immediately when the
        category list is empty.
        """
        cates = list(self.cate_list)
        if not cates:
            return

        async def _run_all():
            # Gathering inside a running loop avoids handing bare coroutines
            # to asyncio.wait(), which Python 3.11+ rejects.
            return await asyncio.gather(
                *(BiliHotCrawler(cate).start() for cate in cates),
                return_exceptions=True,
            )

        # A dedicated loop sidesteps the deprecated implicit loop creation of
        # asyncio.get_event_loop() when no loop is running.
        loop = asyncio.new_event_loop()
        try:
            outcomes = loop.run_until_complete(_run_all())
        finally:
            loop.close()

        for cate, outcome in zip(cates, outcomes):
            if isinstance(outcome, Exception):
                print(f"Crawler for category {cate} failed: {outcome!r}")
class BiliNoticeMail:
    """Singleton that sends a "download finished" notice over SMTP-over-SSL.

    Every ``BiliNoticeMail()`` call returns the same object, but ``__init__``
    still runs each time and opens a fresh connection to
    ``SMTP_SERVER:PORT``.
    """

    def __new__(cls, *args, **kwargs):
        if not hasattr(BiliNoticeMail, '_instance'):
            # object.__new__ rejects extra arguments, so none are forwarded.
            BiliNoticeMail._instance = object.__new__(cls)
        return BiliNoticeMail._instance

    def __init__(self):
        self.server = SMTP_SSL(SMTP_SERVER, PORT)
        self.message = None

    def build_ready_message(self):
        """Prepare the plain-text "ready" message in ``self.message``."""
        self.message = MIMEText('下载完成', 'plain', 'utf-8')
        self.message['From'] = Header(FROM_ADDR, 'utf-8')
        self.message['To'] = Header(TO_ADDR, 'utf-8')
        self.message['Subject'] = Header('BiliReminder:Ready', 'utf-8')

    def notice(self):
        """Log in, send the prepared message, then close the session.

        Raises:
            ValueError: If no message has been built yet.
        """
        if self.message is None:
            raise ValueError('Message is none!')
        try:
            self.server.login(FROM_ADDR, PASSWD)
            self.server.sendmail(FROM_ADDR, [TO_ADDR], self.message.as_string())
        except Exception:
            # Do not leak the socket when login or sending fails.
            self.server.close()
            raise
        self.message = None
        self.server.quit()

    @classmethod
    def send_ready_mail(cls):
        """Build and send the "ready" mail in one step; return the notifier."""
        noticer = cls()
        noticer.build_ready_message()
        noticer.notice()
        print("发信成功,请查收。")
        return noticer
# Column names of the crawler's CSV rows, in file order.
RECORD_KEYS = (
    'cate_id', 'rank_offset', 'bvid', 'duration', 'play', 'video_review',
    'title', 'pic', 'review', 'favorites', 'description', 'arcurl',
)
# year-month-day,hour:minute:second. The lower-case %m/%d matter: %M is the
# minute and %D expands to mm/dd/yy.
TIME_FORMAT = "%Y-%m-%d,%H:%M:%S"


def load_csv_records(pattern='*.csv', now=None):
    """Read every CSV file matching *pattern* into a list of record dicts.

    Each non-blank line is split on commas and zipped with ``RECORD_KEYS``;
    a ``'time'`` entry holding the load timestamp is added. All values are
    strings. Rows with fewer columns than keys yield dicts missing the
    trailing keys; extra columns are dropped.

    Args:
        pattern: Glob pattern of the files to read.
        now: Timestamp to stamp records with; defaults to the current time
            when each line is read.

    Returns:
        A list of dicts, one per non-blank line.
    """
    records = []
    for path in glob.glob(pattern):
        with open(path, encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                record = dict(zip(RECORD_KEYS, line.split(',')))
                stamp = datetime.now() if now is None else now
                record['time'] = stamp.strftime(TIME_FORMAT)
                records.append(record)
    return records


def Defaultdatabase():
    """Bulk-insert all local CSV rows into ``bilibili.collections``.

    This is the initial load (the snapshot taken a week earlier). Nothing is
    sent to MongoDB when no rows were found.
    """
    texts = load_csv_records()
    print('load %d lines' % len(texts))
    if not texts:
        # insert_many rejects an empty document list.
        return
    with MongoClient('localhost', 27017) as client:
        collection = client.bilibili.collections
        result = collection.insert_many(texts)  # single bulk insert
        print(result)


def insert():
    """Reconcile local CSV rows with ``bilibili.collections`` (the later run).

    For each row, a document with the same ``bvid`` is looked up; it is
    inserted when absent and otherwise updated in place with the row's
    fields. Rows too short to contain a ``bvid`` are skipped.
    """
    with MongoClient('localhost', 27017) as client:
        mycol = client.bilibili.collections
        for dic in load_csv_records():
            bvid = dic.get('bvid')
            if bvid is None:
                continue
            query = {'bvid': bvid}
            if mycol.find_one(query) is None:
                mycol.insert_one(dic)
            else:
                mycol.update_one(query, {'$set': dic})
def find(exam_dic):
    """Look up a document in ``bilibili.collections`` matching a filter.

    Args:
        exam_dic: MongoDB filter document, e.g. ``{'bvid': '...'}``.

    Returns:
        The first matching document, or ``None`` when nothing matches.
    """
    with MongoClient('localhost', 27017) as client:
        collection = client.bilibili.collections
        # One filtered query is enough; an extra unfiltered find_one() would
        # only cost a round trip.
        return collection.find_one(exam_dic)
def main():
    """Run the pipeline in order: crawl, send the notice mail, bulk-load, reconcile."""
    stages = (
        BHCFactory(CATE_LIST).produce,
        BiliNoticeMail.send_ready_mail,
        Defaultdatabase,
        insert,
    )
    for stage in stages:
        stage()


if __name__ == '__main__':
    main()
示例数据
{'_id': ObjectId('61d7ae21e5ee63bf735963ba'), 'cate_id': '138', 'rank_offset': '25', 'bvid': 'BV1CU4y1M7TY', 'duration': '67', 'play': '3609867', 'video_review': '3607', 'title': '吱 辅 导', 'pic': '\\/\\/i0.hdslb.com\\/bfs\\/archive\\/2486e99de186567d1ccb720bcec784d4f991685e.jpg', 'review': '3449', 'favorites': '25365', 'description': '这可能是自己都看不下去第二遍的玩意
儿吧……', 'arcurl': 'http:\\/\\/www.bilibili.com\\/video\\/av676718837', 'time': '2022-28-01/07/22,11:28:54'}
{'_id': ObjectId('61d7ae21e5ee63bf735963bb'), 'cate_id': '138', 'rank_offset': '26', 'bvid': 'BV1WL4y1v7TX', 'duration': '102', 'play': '3531214', 'video_review': '1243', 'title': '双 十 一 购 物 图 鉴', 'pic': '\\/\\/i0.hdslb.com\\/bfs\\/archive\\/a9d9d25495eece9909222e3436bb92141984ac93.jpg', 'review': '9890', 'favorites': '13417', 'description': '今年双11你买什么啦?',
'arcurl': 'http:\\/\\/www.bilibili.com\\/video\\/av849099902', 'time': '2022-28-01/07/22,11:28:54'}
{'_id': ObjectId('61d7ae21e5ee63bf735963bc'), 'cate_id': '138', 'rank_offset': '27', 'bvid': 'BV1d44y1e7U3', 'duration': '160', 'play': '3484794', 'video_review': '2388', 'title': '当你考试遇到不会做的题时', 'pic': '\\/\\/i0.hdslb.com\\/bfs\\/archive\\/0ea631ceb2ebb84ffd9695a2673f33413d83934b.jpg', 'review': '1634', 'favorites': '37372', 'description': '我破房了,你呢', 'arcurl': 'http:\\/\\/www.bilibili.com\\/video\\/av976669354', 'time': '2022-28-01/07/22,11:28:54'}
{'_id': ObjectId('61d7ae21e5ee63bf735963bd'), 'cate_id': '138', 'rank_offset': '28', 'bvid': 'BV18b4y1t7BR', 'duration': '178', 'play': '3438752', 'video_review': '3784', 'title': '主 播 直 播 pk 有 多 拼', 'pic': '\\/\\/i0.hdslb.com\\/bfs\\/archive\\/22c32c4def623396d8ca210da94946faa048c812.jpg', 'review': '3373', 'favorites': '47719', 'description': '微博@你别说还真是这
样', 'arcurl': 'http:\\/\\/www.bilibili.com\\/video\\/av634189272', 'time': '2022-28-01/07/22,11:28:54'}
{'_id': ObjectId('61d7ae21e5ee63bf735963be'), 'cate_id': '138', 'rank_offset': '29', 'bvid': 'BV1QQ4y1m7Lc', 'duration': '94', 'play': '3413101', 'video_review': '2816', 'title': '小伙伴们你的人生你来主宰 不要听别人劝!绝对不行太痛苦了!我还在继续哭
!', 'pic': '\\/\\/i0.hdslb.com\\/bfs\\/archive\\/24c557dd8cf60d39e9b2e6d35940d5a21331e168.jpg', 'review': '10387', 'favorites': '23799', 'description': '-', 'arcurl': 'http:\\/\\/www.bilibili.com\\/video\\/av719156420', 'time': '2022-28-01/07/22,11:28:54'}
{'_id': ObjectId('61d7ae21e5ee63bf735963bf'), 'cate_id': '138', 'rank_offset': '30', 'bvid': 'BV1e34y1Z7pU', 'duration': '160', 'play': '3358074', 'video_review': '1776', 'title': '有没有人装逼正好撞到你擅长的领域上?', 'pic': '\\/\\/i0.hdslb.com\\/bfs\\/archive\\/91524e9e19d5586889b97bac42ff93eac7215c69.jpg', 'review': '10388', 'favorites': '11954', 'description': '-', 'arcurl': 'http:\\/\\/www.bilibili.com\\/video\\/av806587972', 'time': '2022-28-01/07/22,11:28:54'}
{'_id': ObjectId('61d7ae21e5ee63bf735963c0'), 'cate_id': '138', 'rank_offset': '31', 'bvid': 'BV1jb4y1t7Xk', 'duration': '243', 'play': '3307671', 'video_review': '134', 'title': '满级生物行为大赏!红眼之“社交天花板”,这气势真的很嚣张', 'pic': '\\/\\/i0.hdslb.com\\/bfs\\/archive\\/e2caac4a2055c530f982f3edb58add0e06f8a38e.jpg', 'review': '39', 'favorites': '13552', 'description': '-', 'arcurl': 'http:\\/\\/www.bilibili.com\\/video\\/av634150292', 'time': '2022-28-01/07/22,11:28:54'}
{'_id': ObjectId('61d7ae21e5ee63bf735963c1'), 'cate_id': '138', 'rank_offset': '32', 'bvid': 'BV1n44y1e7wr', 'duration': '98', 'play': '3306048', 'video_review': '5528', 'title': '南极贱畜是什么梗【梗指南】', 'pic': '\\/\\/i0.hdslb.com\\/bfs\\/archive\\/d235ef7a1194305bb2241b7924592f62cedfaa5b.jpg', 'review': '9735', 'favorites': '38119', 'description': '南极贱畜是什么梗
*本视频仅作为了解该梗的辅助,不能仅靠该视频作为对该梗的全面了解,真正深入了解可以观看原出处以及更多方面~', 'arcurl': 'http:\\/\\/www.bilibili.com\\/video\\/av976548175', 'time': '2022-28-01/07/22,11:28:54'}
网友评论