前言:
微信视频号的加入,再度引燃了短视频领域。今天我们来爬取B站的小视频,其他类型的视频也可以参考这个方法。
(图:视频页面.png)
网页分析:
网址为:https://vc.bilibili.com/p/eden/hot#/?tag=
进行抓包分析,我们可以看到 Request URL 这个属性值。向下滑动加载视频的过程中,可以发现 URL 中只有 next_offset 这个参数会一直变化,其余部分保持不变。由此可以猜测,next_offset 就是下一批视频的起始序号。我们只需要把 URL 中的 next_offset 写成变量,每次请求后从返回的 JSON 数据中取出新的 next_offset,再代入 URL 请求下一页即可。
视频下载方法
上一部分对网页进行了分析,现在我们可以利用 requests.get 方法来获取B站上的小视频
全部代码:
import requests
import json
from contextlib import closing
import time
#之前想使用多线程提高效率,此代码中未使用多线程
import threading
import queue
class bili:
    """Page through a clip-search API and download every listed video.

    The crawler repeatedly formats ``url`` with the current ``next_offset``,
    parses the JSON reply, stores the ``next_offset`` found in it for the
    following request, and downloads each item of the page into ``save_dir``.
    """

    # Directory that receives the downloaded clips and the error log.
    save_dir = 'B站小视频'
    # Seconds to wait on the server before a request is abandoned.
    timeout = 10
    # Consecutive failed page requests tolerated before run() gives up.
    max_failures = 3

    def __init__(self, url, next_offset, video_queue):
        """Store the crawl configuration.

        Args:
            url: API URL template containing a ``{next_offset}`` placeholder.
            next_offset: Offset substituted into ``url`` for the first request.
            video_queue: Queue used to hand ``(name, url)`` pairs from
                get_video() to mp4_download().
        """
        self.url = url
        self.next_offset = next_offset
        self.video_queue = video_queue

    def ua(self):
        """Return the HTTP headers sent with every request."""
        return {
            "origin": "https://vc.bilibili.com",
            "referer": "https://vc.bilibili.com/p/eden/hot",
            "user-agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36",
        }

    def get_req(self, url):
        """Fetch ``url`` and return the body decoded as UTF-8.

        Returns:
            The response text, or None when the request fails or the status
            code is not 200.
        """
        try:
            response = requests.get(url, headers=self.ua(), timeout=self.timeout)
        except requests.RequestException as e:
            # A network error is reported as "no response" so run() can retry
            # instead of crashing.
            print("请求出错:", e)
            return None
        if response.status_code != 200:
            return None
        return response.content.decode('utf-8')

    def _ensure_save_dir(self):
        """Create ``save_dir`` if it does not exist yet."""
        # Imported locally because the surrounding file does not import os.
        import os
        os.makedirs(self.save_dir, exist_ok=True)

    @staticmethod
    def _safe_name(name, max_len=60):
        """Turn a free-text description into a usable file name."""
        # Path separators and characters rejected by common file systems.
        table = {ord(ch): "_" for ch in '\\/:*?"<>|'}
        # Drop newlines and other control characters entirely.
        cleaned = "".join(ch for ch in name.translate(table) if ch.isprintable())
        # Conservative length cap so very long descriptions still fit in a
        # single path component.
        cleaned = cleaned.strip(" .")[:max_len]
        return cleaned or "untitled"

    def mp4_download(self, video_queue):
        """Take one ``(name, url)`` pair from ``video_queue`` and download it.

        The file is streamed to ``save_dir/<name>.mp4`` while a progress line
        is printed. After the download the method sleeps for two seconds.

        Raises:
            requests.RequestException: If the request fails or the server
                answers with an error status.
            OSError: If the target file cannot be written.
        """
        video_desc = video_queue.get()
        video_name = video_desc[0]
        video_url = video_desc[1]
        print("准备下载!")
        self._ensure_save_dir()
        file_path = '{folder}/{name}.mp4'.format(
            folder=self.save_dir, name=self._safe_name(video_name))
        chunk_size = 1024  # upper bound on the bytes read per iteration
        with closing(requests.get(video_url, headers=self.ua(), stream=True,
                                  timeout=self.timeout)) as response:
            print(response.status_code)
            # Do not save an error page under an .mp4 name.
            response.raise_for_status()
            # The header may be absent (e.g. chunked transfer); 0 means unknown.
            content_size = int(response.headers.get('content-length', 0))
            print(content_size)
            data_count = 0
            with open(file_path, "wb") as file:
                for data in response.iter_content(chunk_size=chunk_size):
                    file.write(data)
                    data_count += len(data)
                    # With an unknown total size the percentage stays at 0.
                    now_jd = (data_count / content_size) * 100 if content_size else 0
                    print("\r 文件下载进度:%d%%(%d/%d) - %s" % (now_jd, data_count, content_size, file_path), end=" ")
        print("\n>>> 获取视频成功了!")
        time.sleep(2)

    def _log_error(self, error, video_url, video_name):
        """Append one failed download to ``save_dir/log.txt``."""
        self._ensure_save_dir()
        with open('{folder}/log.txt'.format(folder=self.save_dir), 'a+', encoding='utf-8') as f:
            # One entry per line so consecutive failures stay readable.
            f.write("视频下载出错,错误代码:{error},采集视频:{video_url}|{video_name}内容\n".format(
                error=error, video_url=video_url, video_name=video_name))

    def get_video(self, res):
        """Download every video listed in one parsed API page.

        Args:
            res: Parsed JSON reply; ``res["data"]["items"]`` is read, and for
                each entry ``["item"]["description"]`` and
                ``["item"]["video_playurl"]``.

        A failed download is printed and logged, then the next entry is
        processed.
        """
        for entry in res["data"]["items"] or []:
            item = entry["item"]
            video_name = item["description"].replace(" ", "")
            video_url = item["video_playurl"]
            self.video_queue.put((video_name, video_url))
            print(video_name)
            print(video_url)
            try:
                # Use the instance's own queue rather than a module-level
                # global so the class also works when imported.
                self.mp4_download(self.video_queue)
            except Exception as e:
                print("视频下载出错:", e)
                self._log_error(e, video_url, video_name)

    def run(self):
        """Crawl page after page until no further page is available.

        Returns:
            An empty string once crawling stops: the reply has no
            ``next_offset``, a page has no items, or ``max_failures``
            consecutive requests failed.
        """
        failures = 0
        while True:
            url = self.url.format(next_offset=self.next_offset)
            raw = self.get_req(url)
            res = None
            if raw:
                try:
                    res = json.loads(raw)
                except ValueError as e:
                    print("返回内容不是合法的JSON:", e)
            if res is None:
                # Back off and retry a bounded number of times instead of
                # hammering the server in a tight endless loop.
                failures += 1
                if failures >= self.max_failures:
                    print("连续请求失败,停止采集")
                    return ""
                time.sleep(2)
                continue
            failures = 0
            try:
                data = res["data"]
                self.next_offset = data["next_offset"]
            except (KeyError, TypeError):
                print("可能无next_offset")
                return ""
            if not data.get("items"):
                # An empty page means there is nothing left to download.
                print("没有更多视频了")
                return ""
            try:
                self.get_video(res)
            except Exception as e:
                # Exception (not a bare except) so Ctrl-C still stops the crawl.
                print("视频获取出错...", e)
if __name__ == '__main__':
    # Endpoint captured from the browser's network panel; the {next_offset}
    # placeholder is filled in for every page request.
    api_url = "https://api.vc.bilibili.com/clip/v1/video/search?page_size=30&next_offset={next_offset}&tag=&need_playurl=1&order=new&platform=pc"
    # Kept at module level under this exact name so that any code in this file
    # that looks the queue up as a global keeps working.
    video_queue = queue.Queue()
    # An empty offset requests the first page.
    crawler = bili(api_url, "", video_queue)
    crawler.run()
网友评论