首先我们利用开发者工具进行抓包分析:
(图:页面抓包.png)通过开发者工具可以发现,左侧列出的是本次页面加载时动态请求的信息,可以看到这个链接是以 GET 方式请求的。接下来可以逐个分析并查看它们的响应内容。
分析.png
对返回的 JSON 进行美化(格式化)后,可以基本断定该请求就是美拍的接口地址,返回内容即视频列表数据,下面会对这段 JSON 进行分析处理:
{
"medias":[
{
"id":1224285358,
"client_id":1089857302,
"caption":"当老爸变成了灯神.....",
"entry_info":null,
"weibo_share_caption":"#美拍#当老爸变成了灯神.....",
"facebook_share_caption":"",
"weixin_share_caption":"分享@祝晓晗🌻 的美拍",
"weixin_friendfeed_share_caption":"当老爸变成了灯神.....",
"qzone_share_caption":"当老爸变成了灯神.....",
"qq_share_caption":"当老爸变成了灯神.....",
"instagram_share_caption":"分享祝晓晗🌻的美拍“当老爸变成了灯神.....”,快来看看!",
"weixin_share_sub_caption":"来跟我一起玩美拍吧~",
"weixin_friendfeed_share_sub_caption":"来跟我一起玩美拍吧~",
"qzone_share_sub_caption":"来跟我一起玩美拍吧~",
"qq_share_sub_caption":"来跟我一起玩美拍吧~",
"geo":null,
"video":"0c02aHR0cHM635RLy9tdnZpZGVvMTEubWVpdHVkYXRhLmNvbS81ZjI1NzY0MmE1MmJkMTV1bThhOGFwODk0MV9IMjY0XzRfMjJlNTFmNTM0MGRm2mSmMjcubXA0",
"url":"[https://www.meipai.com/media/1224285358?client_id=1089857306&utm_media_id=1224285358&utm_source=meipai_share&gid=](https://www.meipai.com/media/1224285358?client_id=1089857306&utm_media_id=1224285358&utm_source=meipai_share&gid=)",
"cover_pic":"[https://mvimg11.meitudata.com/5f257642c12ec6a1eqs8673689.jpg](https://mvimg11.meitudata.com/5f257642c12ec6a1eqs8673689.jpg)",
"pic_size":"720*1280",
"category":3,
"time":49,
"is_long":true,
"show_controls":false,
"created_at":"08-01 22:05",
"comments_count":114,
"likes_count":3100,
"reposts_count":3,
"user":Object{...},
"cur_lives_id":"",
"cur_lives_type":0,
"cur_lives_stream_type":0,
"cur_lives_scheme":"",
"cur_yy_actid":"",
"feed_id":"6856016371960585390",
"locked":false,
"type":5,
"caption_url_params":[
],
"privacy_config":Object{...},
"has_watermark":0,
"refuse_gift":true,
"refuse_gift_reason":"送礼功能已下线",
"hide_gift_btn":true,
"new_music":Object{...},
"convert_cover_pic":"",
"ar_magic_info":null,
"aside_info":null,
"convert_pic_size":"",
"cover_pic_resize":null,
"m_plan":false,
"ad_level":"N",
"first_frame_pic":"[http://mvimg10.meitudata.com/5f257aaf2bc6c3332.jpg](http://mvimg10.meitudata.com/5f257aaf2bc6c3332.jpg)",
"first_frame_pic_size":"720*1280",
"is_safe":1,
"is_safe2":1,
"miniprogram_data":Array[0],
"cover_title":"",
"dangerous_action":false,
"category_tag_id":0,
"recommend_caption":null,
"recommend_cover_pic":null,
"recommend_cover_pic_size":null,
"user_recommend_cover_pic":null,
"user_recommend_cover_pic_size":null,
"is_prefer":0,
"is_ad":0,
"left_bottom_tip":Array[0],
"display_source":125536,
"trace_id":"ke0yujgv-30xvb4-4chv",
"item_info":"{"id":1224285358,"code":125536,"codeDetail":"125536,125541","trace_id":"ke0yujgv-30xvb4-4chv"}",
"caption_origin":"当老爸变成了灯神.....",
"campaign":"",
"created_at_origin":1596290704,
"caption_complete":"当老爸变成了灯神.....",
"caption_all":"当老爸变成了灯神....."
},
Object{...},
Object{...},
Object{...},
Object{...},
Object{...},
Object{...},
Object{...},
Object{...}
],
"total":1000,
"current_page":2
}
使用找到的接口地址:
def main(self, i):
    """Request one page of the timeline feed and pass it to ``get_video``.

    Args:
        i: Value substituted into the ``page`` query parameter of the URL.

    Nothing is returned. An empty/falsy response is skipped, and any exception
    raised while processing the page is printed instead of propagated.
    """
    # Meipai endpoint discovered through packet capture.
    page_url = "https://www.meipai.com/squares/new_timeline?page={i}&count=24&tid=13".format(i=i)
    print(page_url)
    body = self.get_req(page_url)
    if not body:
        return
    try:
        self.get_video(body)
    except Exception as err:
        print('获取视频出错,错误代码:', err)
解析抓包文件,获取视频信息
def get_video(self, response):
    """Parse the timeline JSON and download every video it lists.

    Args:
        response: JSON text whose top-level ``medias`` list holds items with
            ``caption``, ``weibo_share_caption`` and ``video`` keys.

    A failure while decoding or downloading a single video is printed and
    appended to ``meipai/spider.txt``; the loop then moves on to the next item.
    """
    # The failure log lives in the output directory, so make sure it exists
    # before the first write; otherwise logging itself raises.
    os.makedirs('meipai', exist_ok=True)

    for req in json.loads(response)['medias']:
        # Fall back to the Weibo share caption when the caption is empty.
        video_name = req['caption'] or req['weibo_share_caption']
        video_name = video_name.replace(' ', '')
        # Replace characters that are not allowed in file names.
        video_name = re.sub(r'[\|\/\<\>\:\*\?\\\"]', "_", video_name)
        print(video_name)
        video_url = req['video']

        try:
            # The ``video`` field is obfuscated; decode it to the real URL.
            videourl = self.video_decode(video_url).decode('utf8')
        except Exception as e:
            print(r'视频地址解密出错,错误代码:', e)
            with open(r'meipai/spider.txt', 'a+', encoding='utf-8') as f:
                f.write('视频解密出错,错误代码:{e}---采集{video_url}|{video_name}内容失败\n'.format(
                    e=e, video_url=video_url, video_name=video_name))
            continue

        print(videourl)
        try:
            self.download(video_name, videourl)
        except Exception as e:
            print('视频下载出错,错误代码:', e)
            with open(r'meipai/spider.txt', 'a+', encoding='utf-8') as f:
                # The template names ``videourl`` (the decoded address), so the
                # keyword passed to format() must match it.
                f.write('视频下载出错,错误代码:{e}---采集{videourl}|{video_name}内容失败\n'.format(
                    e=e, videourl=videourl, video_name=video_name))
解析视频地址信息
def video_decode(self, encoded_string):
    """Decode an obfuscated ``video`` field into the raw URL bytes.

    The first four characters, reversed, are read as a hexadecimal number.
    The decimal digits of that number give, in order: the start and length of
    a junk segment near the front of the payload, and the distance from the
    end and length of a second junk segment. With both segments removed the
    payload is plain Base64.

    Args:
        encoded_string: Obfuscated string as found in the ``video`` key.

    Returns:
        The Base64-decoded bytes (the caller decodes them as UTF-8).

    Raises:
        ValueError: if the four-character header is not hexadecimal.
        IndexError: if the header yields fewer than four decimal digits.
    """
    def cut(text, start, length):
        # Remove the segment text[start:start + length]. Only the FIRST match
        # is dropped: an unbounded replace() would also delete identical
        # character runs later in the payload and corrupt the Base64 data.
        head, rest = text[:start], text[start:]
        return head + rest.replace(text[start:start + length], "", 1)

    header, payload = encoded_string[:4], encoded_string[4:]
    digits = str(int(header[::-1], 16))

    # First junk segment: position and length come from digits 0 and 1.
    payload = cut(payload, int(digits[0]), int(digits[1]))

    # Second junk segment: its start is measured from the end of the payload
    # that remains after the first cut.
    tail_start = len(payload) - int(digits[2]) - int(digits[3])
    payload = cut(payload, tail_start, int(digits[3]))

    return base64.b64decode(payload)
下载视频,附带进度显示
def download(self, name, videourl):
    """Stream a video to ``meipai/<name>.mp4`` while printing progress.

    Args:
        name: File name stem used for the saved ``.mp4`` file.
        videourl: URL the video is fetched from.

    Raises:
        requests.HTTPError: if the server answers with an error status, so an
            error page is never saved as a video file.
    """
    print("准备下载!")
    # Create the output directory on first use instead of failing in open().
    os.makedirs('meipai', exist_ok=True)
    file_path = 'meipai/{name}.mp4'.format(name=name)
    with closing(requests.get(videourl, stream=True)) as response:
        chunk_size = 1024  # maximum number of bytes read per chunk
        print(response.status_code)
        response.raise_for_status()
        # The header may be absent (e.g. chunked transfer); 0 means "unknown".
        content_size = int(response.headers.get('content-length', 0))
        print(content_size)
        data_count = 0
        with open(file_path, "wb") as file:
            for data in response.iter_content(chunk_size=chunk_size):
                file.write(data)
                data_count += len(data)
                if content_size:
                    now_jd = (data_count / content_size) * 100
                    print("\r 文件下载进度:%d%%(%d/%d) - %s" % (now_jd, data_count, content_size, file_path), end=" ")
                else:
                    # Total size unknown: report the byte count only, which
                    # also avoids dividing by zero.
                    print("\r 文件下载进度:%d bytes - %s" % (data_count, file_path), end=" ")
    print("\n>>> 获取视频成功了!")
    # Short pause after each download.
    time.sleep(2)
完整代码
import base64
import json
import os
import re
import threading
import time
from contextlib import closing

import requests
from fake_useragent import UserAgent
class MP():
    """Crawler that lists Meipai timeline pages and downloads their videos."""

    # Proxy mapping used for video downloads. Set to None on the class or on
    # an instance to download without a proxy.
    DOWNLOAD_PROXIES = {'https': 'https://127.0.0.1:1080', 'http': 'http://127.0.0.1:1080'}

    def video_decode(self, encoded_string):
        """Decode an obfuscated ``video`` field into the raw URL bytes.

        The first four characters, reversed, are read as a hexadecimal number.
        The decimal digits of that number give, in order: the start and length
        of a junk segment near the front of the payload, and the distance from
        the end and length of a second junk segment. With both segments
        removed the payload is plain Base64.

        Args:
            encoded_string: Obfuscated string as found in the ``video`` key.

        Returns:
            The Base64-decoded bytes (the caller decodes them as UTF-8).

        Raises:
            ValueError: if the four-character header is not hexadecimal.
            IndexError: if the header yields fewer than four decimal digits.
        """
        def cut(text, start, length):
            # Remove the segment text[start:start + length]. Only the FIRST
            # match is dropped: an unbounded replace() would also delete
            # identical character runs later in the payload.
            head, rest = text[:start], text[start:]
            return head + rest.replace(text[start:start + length], "", 1)

        header, payload = encoded_string[:4], encoded_string[4:]
        digits = str(int(header[::-1], 16))

        # First junk segment: position and length come from digits 0 and 1.
        payload = cut(payload, int(digits[0]), int(digits[1]))

        # Second junk segment: its start is measured from the end of the
        # payload that remains after the first cut.
        tail_start = len(payload) - int(digits[2]) - int(digits[3])
        payload = cut(payload, tail_start, int(digits[3]))

        return base64.b64decode(payload)

    def ua(self):
        """Return the HTTP headers sent with timeline requests."""
        # A fixed User-Agent is used; the previously created (and unused)
        # random User-Agent generator has been dropped.
        headers = {
            'Cookie': 'MUSID=kdhd1o131g536r6shisl5rdcg7; MP_WEB_GID=266934702254632; virtual_device_id=433ced9ee7d2b137b89ae37d40df50e9; pvid=UdlW5diAfeJPUaHLK1j3vMC7xVBOnB9c; sid=kdhd1o131g536r6shisl5rdcg7; UM_distinctid=174006923bd1a2-0ad676043abc8a-581b3318-1fa400-174006923beff; CNZZDATA1256786412=1978692496-1597731334-%7C1597731334',
            'Host': 'www.meipai.com',
            'Referer': 'https://www.meipai.com/square/13',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36',
        }
        return headers

    def get_req(self, url):
        """GET ``url`` and return the body as text, or None unless status is 200."""
        response = requests.get(url, headers=self.ua())
        if response.status_code == 200:
            return response.content.decode('utf-8')
        return None

    def _record_failure(self, line):
        """Append one line to the failure log ``meipai/spider.txt``."""
        # Create the directory first so that logging cannot fail on a fresh run.
        os.makedirs('meipai', exist_ok=True)
        with open(r'meipai/spider.txt', 'a+', encoding='utf-8') as f:
            f.write(line)

    def get_video(self, response):
        """Parse the timeline JSON and download every video it lists.

        Args:
            response: JSON text whose top-level ``medias`` list holds items
                with ``caption``, ``weibo_share_caption`` and ``video`` keys.

        A failure while decoding or downloading a single video is printed and
        appended to ``meipai/spider.txt``; the loop then continues.
        """
        for req in json.loads(response)['medias']:
            # Fall back to the Weibo share caption when the caption is empty.
            video_name = req['caption'] or req['weibo_share_caption']
            video_name = video_name.replace(' ', '')
            # Replace characters that are not allowed in file names.
            video_name = re.sub(r'[\|\/\<\>\:\*\?\\\"]', "_", video_name)
            print(video_name)
            video_url = req['video']

            try:
                # The ``video`` field is obfuscated; decode it to the real URL.
                videourl = self.video_decode(video_url).decode('utf8')
            except Exception as e:
                print(r'视频地址解密出错,错误代码:', e)
                self._record_failure(
                    '视频解密出错,错误代码:{e}---采集{video_url}|{video_name}内容失败\n'.format(
                        e=e, video_url=video_url, video_name=video_name))
                continue

            print(videourl)
            try:
                self.download(video_name, videourl)
            except Exception as e:
                print('视频下载出错,错误代码:', e)
                # The template names ``videourl`` (the decoded address), so the
                # keyword passed to format() must match it.
                self._record_failure(
                    '视频下载出错,错误代码:{e}---采集{videourl}|{video_name}内容失败\n'.format(
                        e=e, videourl=videourl, video_name=video_name))

    def download(self, name, videourl):
        """Stream a video to ``meipai/<name>.mp4`` while printing progress.

        Args:
            name: File name stem used for the saved ``.mp4`` file.
            videourl: URL the video is fetched from.

        Raises:
            requests.HTTPError: if the server answers with an error status, so
                an error page is never saved as a video file.
        """
        print("准备下载!")
        # Create the output directory on first use instead of failing in open().
        os.makedirs('meipai', exist_ok=True)
        file_path = 'meipai/{name}.mp4'.format(name=name)
        with closing(requests.get(videourl, proxies=self.DOWNLOAD_PROXIES, stream=True)) as response:
            chunk_size = 1024  # maximum number of bytes read per chunk
            print(response.status_code)
            response.raise_for_status()
            # The header may be absent (e.g. chunked transfer); 0 means "unknown".
            content_size = int(response.headers.get('content-length', 0))
            print(content_size)
            data_count = 0
            with open(file_path, "wb") as file:
                for data in response.iter_content(chunk_size=chunk_size):
                    file.write(data)
                    data_count += len(data)
                    if content_size:
                        now_jd = (data_count / content_size) * 100
                        print("\r 文件下载进度:%d%%(%d/%d) - %s" % (now_jd, data_count, content_size, file_path), end=" ")
                    else:
                        # Total size unknown: report the byte count only,
                        # which also avoids dividing by zero.
                        print("\r 文件下载进度:%d bytes - %s" % (data_count, file_path), end=" ")
        print("\n>>> 获取视频成功了!")
        # Short pause after each download.
        time.sleep(2)

    def main(self, i):
        """Request one page of the timeline feed and process its videos.

        Args:
            i: Value substituted into the ``page`` query parameter of the URL.
        """
        # Meipai endpoint discovered through packet capture.
        url = "https://www.meipai.com/squares/new_timeline?page={i}&count=24&tid=13".format(i=i)
        print(url)
        response = self.get_req(url)
        if not response:
            return
        try:
            self.get_video(response)
        except Exception as e:
            print('获取视频出错,错误代码:', e)
if __name__ == "__main__":
    # A single crawler instance is shared by all worker threads.
    crawler = MP()
    # Start one thread per page index 0..99; the threads are not joined
    # explicitly here.
    for page in range(100):
        worker = threading.Thread(target=crawler.main, kwargs={"i": page})
        worker.start()
网友评论