美文网首页
python爬取某一视频网站视频

python爬取某一视频网站视频

作者: 刘年 | 来源:发表于2020-03-27 14:25 被阅读0次

    关键在于ajax请求分析
    灵活运用正则和xpath来获取数据
    文件命名时替换掉不规范字符

    from urllib import parse
    import requests
    from lxml import etree
    import re
    import time
    # The listing is loaded via AJAX; the `start` query parameter controls
    # which batch of entries the endpoint returns.
    # Each listing batch yields the URLs of the pages that host the videos.
    headers = {
        'Referer': 'https://www.pearvideo.com/category_8',
        'User-Agent': (
            'Mozilla/5.0 (Windows NT 6.1; WOW64) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/70.0.3538.25 Safari/537.36 '
            'Core/1.70.3754.400 QQBrowser/10.5.4034.400'
        ),
    }
    # Site root; relative hrefs taken from the listing are resolved against it.
    base_url = 'https://www.pearvideo.com/'
    def list_url(n, timeout=10):
        """Return the absolute video-page URLs found in listing batch ``n``.

        The category listing is fetched from the ``category_loading.jsp``
        endpoint, with the ``start`` offset set to ``12 * n``.

        Args:
            n: Zero-based batch index.
            timeout: Seconds to wait for the server before giving up, so a
                stalled connection cannot block the crawl forever.

        Returns:
            A list of absolute URLs (possibly empty), one per
            ``div.vervideo-bd > a`` link in the response.
        """
        query = parse.urlencode({
            'reqType': 5,
            'categoryId': 8,
            'start': 12 * n,
        })
        url = 'https://www.pearvideo.com/category_loading.jsp?{0}'.format(query)
        list_page = requests.get(url, headers=headers, timeout=timeout).text
        html = etree.HTML(list_page)
        if html is None:
            # Guard against a parse that produced no tree (e.g. an empty body).
            return []
        hrefs = html.xpath('//div[@class="vervideo-bd"]/a/@href')
        # urljoin resolves both "video_123" and "/video_123" correctly, whereas
        # plain concatenation would yield a double slash for the latter.
        return [parse.urljoin(base_url, href) for href in hrefs]
    def get_video_page(url, timeout=10):
        """Fetch a video page and extract its title and media source URL.

        Args:
            url: Absolute URL of the page hosting the video.
            timeout: Seconds to wait for the server before giving up.

        Returns:
            A dict with two keys: ``'title'`` (the page's ``h1.video-tt`` text
            with characters that are invalid in file names removed) and
            ``'scr'`` (the value of ``srcUrl`` found in the page source).

        Raises:
            IndexError: If the page contains no title element or no
                ``srcUrl`` assignment.
        """
        page_text = requests.get(url, headers=headers, timeout=timeout).text
        page_html = etree.HTML(page_text)
        titles = []
        if page_html is not None:
            titles = page_html.xpath('//h1[@class="video-tt"]/text()')
        if not titles:
            raise IndexError('no video title found on {0}'.format(url))
        # Drop characters that cannot appear in a file name, then trim the
        # surrounding whitespace so the name does not start or end with a blank.
        title = re.sub(r'[\/\\\:\*\?\"\|\<\>]', '', titles[0]).strip()

        # The media address is not in the DOM; it is assigned in an inline
        # script as srcUrl="...", so it is pulled from the raw page text.
        sources = re.findall(r'srcUrl="(.*?)",vdoUrl=', page_text)
        if not sources:
            raise IndexError('no srcUrl found on {0}'.format(url))

        return {'title': title, 'scr': sources[0]}
    def get_vedio(url, timeout=30):
        """Download ``url`` and return the response body as bytes.

        Args:
            url: Address of the media file to download.
            timeout: Seconds to wait for the server before giving up.

        Returns:
            The raw response content.
        """
        # The downloaded bytes were previously assigned to a local and then
        # discarded, so the function always returned None.
        return requests.get(url, headers=headers, timeout=timeout).content
    
    if __name__ == '__main__':
        # Crawl the first ten listing batches and save every video found.
        import os  # imported here because only the script entry point needs it

        out_dir = '梨视频下载'
        # open() does not create missing directories, so make sure it exists.
        os.makedirs(out_dir, exist_ok=True)

        for x in range(10):
            for page_url in list_url(x):
                info = get_video_page(page_url)
                print(info)
                response = requests.get(info['scr'], headers=headers, timeout=30)
                if response.ok:
                    # os.path.join keeps the path valid on non-Windows systems,
                    # where a literal backslash is not a directory separator.
                    path = os.path.join(out_dir, '{0}.mp4'.format(info['title']))
                    with open(path, 'wb') as ff:
                        ff.write(response.content)
                    print('{0}下载完成'.format(path))
                else:
                    # Do not save an HTTP error body under an .mp4 name.
                    print('{0}下载失败: HTTP {1}'.format(info['scr'], response.status_code))
                # Throttle requests; done after the file has been closed.
                time.sleep(1)
    
    

    相关文章

      网友评论

          本文标题:python爬取某一视频网站视频

          本文链接:https://www.haomeiwen.com/subject/bapluhtx.html