利用线程池进行视频抓取
网页源码li标签结构:上图是网页的源码分析。首页的ul标签下面有4个li标签,每个li标签中都包含视频播放页的地址,因此先取出所有li标签,如:li_list=tree.xpath('//ul[@id="listvideoListUl"]/li')
以上是li标签结构,所以视频播放页地址这样取得 srcurl="https://www.pearvideo.com/"+li.xpath('./div/a/@href')[0]
get_data(dic)是线程池中每个线程调用的下载函数,参数dic是包含视频地址(url)和文件名(name)的字典。
import re
from multiprocessing.dummy import Pool

import requests
from lxml import etree
# Pear Video sports category page; its listing is scraped for video links.
url = "https://www.pearvideo.com/category_9"
# Request headers as a dict; the same User-Agent is sent with every request.
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36 QIHU 360SE"}
# A timeout keeps the script from hanging forever on an unresponsive server.
res = requests.get(url=url, headers=headers, timeout=10).text
tree = etree.HTML(res)
# Each <li> under the listing <ul> holds the link to one video's play page.
li_list = tree.xpath('//ul[@id="listvideoListUl"]/li')
# The video address lives inside inline JavaScript on the play page, so it is
# pulled out with a regex rather than XPath. Example of the matched text:
# srcUrl="https://video.pearvideo.com/mp4/adshort/20190518/cont-1555912-13920965_adpkg-ad_hd.mp4",vdoUrl......
ex = r'srcUrl="(.*?)",vdoUrl'
# Download jobs: one {"url": ..., "name": ...} dict per video found.
urls = []
for li in li_list:
    hrefs = li.xpath('./div/a/@href')
    titles = li.xpath('./div/a/div[2]/text()')
    if not hrefs or not titles:
        # Skip list items that lack a link or a title instead of crashing.
        continue
    srcurl = "https://www.pearvideo.com/" + hrefs[0]
    # Strip spaces from the title and use it as the output file name.
    name = titles[0].replace(' ', '') + '.mp4'
    detail_page = requests.get(url=srcurl, headers=headers, timeout=10).text
    found = re.search(ex, detail_page)
    if found is None:
        # The play page did not contain a srcUrl entry; nothing to download.
        continue
    # A separate name keeps the category `url` above from being overwritten.
    video_url = found.group(1)
    dic = {"url": video_url, "name": name}
    urls.append(dic)
def get_data(dic):
    """Download one video and write it to the file named in ``dic``.

    Args:
        dic: Mapping with two keys: "url", the address of the video file,
            and "name", the path of the file to write.

    Raises:
        requests.HTTPError: If the server responds with an error status, so
            that an error page is never saved under the video's file name.

    Note:
        The body is streamed to disk; a failure in the middle of the
        transfer can leave a partially written file behind.
    """
    url = dic["url"]
    print("正在下载:", dic['name'])
    # Stream the response so the whole video is never held in memory at once;
    # the timeout prevents a stalled connection from blocking a pool thread.
    with requests.get(url=url, headers=headers, stream=True, timeout=30) as response:
        response.raise_for_status()
        with open(dic['name'], 'wb') as fp:
            for chunk in response.iter_content(chunk_size=64 * 1024):
                fp.write(chunk)
    print("下载完成:", dic['name'])
# Thread pool with four worker threads (multiprocessing.dummy is thread-based).
pool = Pool(processes=4)
# Run get_data once per entry in urls; map blocks until every call returns.
pool.map(get_data, urls)
# No further tasks will be submitted to the pool.
pool.close()
# Wait for the worker threads to exit before the main thread continues.
pool.join()
网友评论