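# Key points of this script:
# - analyse the site's AJAX requests to find where the data comes from;
# - combine regular expressions and XPath to extract the data;
# - strip characters that are not allowed in file names before saving.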
from urllib import parse
import requests
from lxml import etree
import re
import time
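# The listing is loaded via AJAX; the `start` query parameter controls which batch of items is returned.
# Step 1: collect the URLs of the pages that host each video.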
# Root of the site; relative links from the listing are appended to it.
base_url = 'https://www.pearvideo.com/'

# Headers sent with every request made by this script.
headers = {
    'Referer': 'https://www.pearvideo.com/category_8',
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 6.1; WOW64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/70.0.3538.25 Safari/537.36 '
        'Core/1.70.3754.400 QQBrowser/10.5.4034.400'
    ),
}
def list_url(n, timeout=10):
    """Return the video-page URLs found in batch ``n`` of the category listing.

    The listing is requested from the ``category_loading.jsp`` endpoint with
    ``reqType=5`` and ``categoryId=8``; the ``start`` parameter advances in
    steps of 12.

    Args:
        n: Batch index; the request is sent with ``start = 12 * n``.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.

    Returns:
        A list of URLs built as ``base_url + href`` for every link matched
        under ``div.vervideo-bd``. Empty when the response body is blank,
        cannot be parsed, or contains no matching links.
    """
    query = parse.urlencode({
        'reqType': 5,
        'categoryId': 8,
        'start': 12 * n,
    })
    url = 'https://www.pearvideo.com/category_loading.jsp?{0}'.format(query)
    # Without a timeout a stalled connection would block the script forever.
    list_page = requests.get(url, headers=headers, timeout=timeout).text
    if not list_page.strip():
        # A blank body has nothing to parse; treat it as "no more items".
        return []
    html = etree.HTML(list_page)
    if html is None:
        return []
    hrefs = html.xpath('//div[@class="vervideo-bd"]/a/@href')
    return [base_url + href for href in hrefs]
def get_video_page(url, timeout=10):
    """Fetch one video page and extract its title and media URL.

    Args:
        url: Address of the video's HTML page.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.

    Returns:
        A dict with two keys:
        ``'title'`` is the text of the ``h1.video-tt`` heading with the
        characters matched by the sanitising pattern (slash, backslash,
        colon, asterisk, question mark, double quote, pipe, angle brackets)
        removed and surrounding whitespace stripped;
        ``'scr'`` is the text captured between ``srcUrl="`` and
        ``",vdoUrl=`` in the raw page source.

    Raises:
        IndexError: If the heading or the ``srcUrl`` pattern is not found.
    """
    # Without a timeout a stalled connection would block the script forever.
    page_text = requests.get(url, headers=headers, timeout=timeout).text
    page_tree = etree.HTML(page_text)
    title = page_tree.xpath('//h1[@class="video-tt"]/text()')[0]
    # The title becomes a file name, so drop characters that are not allowed
    # there and any whitespace/newlines surrounding the heading text.
    title = re.sub(r'[\/\\\:\*\?\"\|\<\>]', '', title).strip()
    # The media URL is taken from the raw page text with a regex.
    video_src = re.findall('srcUrl="(.*?)",vdoUrl=', page_text)[0]
    # The key 'scr' (sic) is kept because callers index the result with it.
    return {'title': title, 'scr': video_src}
def get_vedio(url, timeout=30):
    """Download ``url`` and return the response body.

    Args:
        url: Address of the resource to download.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.

    Returns:
        The response body as ``bytes``.
    """
    # Return the payload so the download is not thrown away.
    return requests.get(url, headers=headers, timeout=timeout).content
if __name__ == '__main__':
    # Script entry point: walk the first 10 listing batches and save every
    # video found there as an .mp4 file in the download directory.
    import os

    save_dir = '梨视频下载'
    # open() does not create missing directories, so make sure it exists.
    os.makedirs(save_dir, exist_ok=True)

    for page in range(10):
        for page_url in list_url(page):
            info = get_video_page(page_url)
            print(info)
            # Without a timeout a stalled download would block forever.
            video_bytes = requests.get(
                info['scr'], headers=headers, timeout=30
            ).content
            # os.path.join picks the right separator on every platform.
            save_path = os.path.join(save_dir, '{0}.mp4'.format(info['title']))
            with open(save_path, 'wb') as out_file:
                out_file.write(video_bytes)
            print('{0}下载完成'.format(save_path))
            # Pause between downloads to avoid hammering the server.
            time.sleep(1)
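# "Reader comments" - appears to be a heading carried over from the web page this script was copied from; not part of the code.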