美文网首页
python爬虫系列1-沙沙野视频多线程

python爬虫系列1-沙沙野视频多线程

作者: livein80 | 来源:发表于2020-07-29 18:12 被阅读0次

    任务需求:请求沙沙野(ssyer.com)的视频列表接口,取出每个视频的标题和下载链接,再用多进程池并发下载到本地 ./vid/ 目录。

    # -*- coding: utf-8 -*-
    # @Time    : 2020/7/29 6:05 下午
    # @Author  : livein80
    # @Email   : 12985594@qq.com
    # @File    : ssyer.py
    # @Software : PyCharm
    import requests
    import os
    # 多进程下载
    from multiprocessing import Pool
    
    
    # Directory name for JSON dumps; not referenced by the code visible here.
    json_dir = './json_dir/'

    # Request headers copied from a desktop Chrome session on www.ssyer.com.
    headers = {
        'Connection': 'keep-alive',
        'Pragma': 'no-cache',
        'Cache-Control': 'no-cache',
        'Accept': 'application/json',
        'User-Agent': (
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/84.0.4147.105 Safari/537.36'
        ),
        'Content-Type': 'application/json',
        'Origin': 'https://www.ssyer.com',
        'Sec-Fetch-Site': 'same-origin',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Dest': 'empty',
        'Referer': 'https://www.ssyer.com/cate/2',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,ko;q=0.7,und;q=0.6',
    }

    # Pre-serialized JSON body for the listing request (sent verbatim).
    data = (
        '{"cateId":2,"order":2,"recommendType":1,'
        '"page":{"showCount":20,"currentPage":1}}'
    )

    # One shared session so the connection is reused across listing requests.
    session = requests.Session()
    def get_data(timeout=10):
        """POST the listing query to the ssyer API and return the raw response.

        The request uses the module-level ``session``, ``headers`` and ``data``
        together with the hard-coded browser cookies below.

        Args:
            timeout: Seconds to wait for the server before giving up. Without
                a timeout ``requests`` may block forever on a stalled
                connection.

        Returns:
            The ``requests.Response`` object; the caller checks the status.

        Raises:
            requests.exceptions.RequestException: on network failure/timeout.
        """
        # NOTE(review): these cookies were captured from one browser session
        # (timestamps point at 2020-07); they are presumably expired by now
        # and may need to be refreshed by hand - confirm against the site.
        cookies = {
            'UM_distinctid': '17398ee35fb8eb-0679366a6e0d54-31627304-232800-17398ee35fc744',
            'CNZZDATA1278764889': '613028250-1595997139-https%253A%252F%252Fwww.google.com%252F%7C1595997139',
            '_dg_playback.7b6028a56aac520d.ce42': '1',
            '_dg_abtestInfo.7b6028a56aac520d.ce42': '1',
            '_dg_check.7b6028a56aac520d.ce42': '1',
            '_dg_antiBotFlag.7b6028a56aac520d.ce42': '1',
            '_dg_antiBotInfo.7b6028a56aac520d.ce42': '10%7C%7C%7C3600',
            'SESSION': 'ZTg3OGVjMGUtZjA0Ni00NmVkLTg2MjctMTY0ZWJhODRmYTc2',
            'Hm_lvt_8f50334c83664955c1a1a866dd168053': '1595998616,1595998662',
            'Hm_lpvt_8f50334c83664955c1a1a866dd168053': '1595998662',
            '_dg_id.7b6028a56aac520d.ce42': 'fc0bc167b752f00b%7C%7C%7C1595998616%7C%7C%7C0%7C%7C%7C1595998662%7C%7C%7C1595998616%7C%7C%7C%7C%7C%7Ce809b4e64783781d%7C%7C%7Chttps%3A%2F%2Fwww.google.com%2F%7C%7C%7Chttps%3A%2F%2Fwww.google.com%2F%7C%7C%7C1%7C%7C%7Cundefined',
        }
        response = session.post(
            'https://www.ssyer.com/apis/20001',
            headers=headers,
            cookies=cookies,
            data=data,
            timeout=timeout,
        )
        return response
    
    # Download a single video file.
    def start_load_vid(vid_name,vid_url):
        """Download ``vid_url`` and save it as ``./vid/<vid_name>.mp4``.

        The body is streamed to a temporary ``.part`` file and renamed only
        after the transfer finished, so a failed download never leaves a
        truncated ``.mp4`` behind.

        Args:
            vid_name: Title used for the output file name and the log line.
            vid_url: Direct URL of the video file.

        Raises:
            requests.exceptions.RequestException: on network errors, timeouts
                or a non-2xx HTTP status.
            OSError: if the file cannot be written.
        """
        # This function runs concurrently in several pool workers; exist_ok
        # avoids the isdir()/mkdir() race that raised FileExistsError.
        os.makedirs('./vid/', exist_ok=True)
        # Path separators in a title would point outside ./vid/ or at a
        # directory that does not exist.
        safe_name = str(vid_name).replace('/', '_').replace('\\', '_')
        target = './vid/{}.mp4'.format(safe_name)
        partial = target + '.part'
        try:
            # NOTE(review): verify=False disables TLS certificate checks; it
            # is kept from the original - confirm it is still needed.
            with requests.get(vid_url, verify=False, stream=True,
                              timeout=(10, 60)) as res:
                # Do not save an HTML error page under an .mp4 name.
                res.raise_for_status()
                with open(partial, 'wb') as file:
                    # Stream in chunks instead of holding the whole video in
                    # memory in every worker.
                    for chunk in res.iter_content(chunk_size=64 * 1024):
                        file.write(chunk)
            os.replace(partial, target)
        except Exception:
            # Clean up the incomplete file, then let the caller see the error.
            if os.path.exists(partial):
                os.remove(partial)
            raise
        print('%s 视频下载完成'%vid_name)
    # Build the list of videos to download.
    def get_vid_lis(list):
        """Map API items to ``{'name': title, 'link': zip_url}`` dicts.

        Args:
            list: Iterable of item dicts, each with ``'title'`` and ``'zip'``
                keys. (The parameter name shadows the builtin; it is kept so
                existing callers are unaffected.)

        Returns:
            A new list with one ``{'name', 'link'}`` dict per input item, in
            the same order.

        Raises:
            KeyError: if an item lacks ``'title'`` or ``'zip'``.
        """
        return [
            {'name': entry['title'], 'link': entry['zip']}
            for entry in list
        ]
    
    # === Spider entry point ===
    def start_spider(callback):
        """Fetch the video listing, retrying up to five times, and process it.

        Each attempt calls ``get_data()``. On the first HTTP 200 the decoded
        JSON is passed to ``callback`` and returned. The previous recursive
        version dropped the result of a successful retry and returned
        ``None``; the loop returns the payload from whichever attempt
        succeeded.

        Args:
            callback: Callable invoked with the decoded JSON payload.

        Returns:
            The decoded JSON payload, or ``None`` if all attempts failed.
        """
        max_attempts = 5
        for count in range(1, max_attempts + 1):
            print('check-->', count)
            response = get_data()
            print(response.status_code,session.cookies)
            if response.status_code == 200:
                json_obj = response.json()
                callback(json_obj)
                return json_obj
            # NOTE: nothing refreshes the cookies here; a non-200 answer is
            # simply retried with the same request.
        print('cookies更新失败!')
        return None
    
    def start_download(json_obj):
        """Download every video listed in ``json_obj['data']`` in parallel.

        Downloads run in a ``multiprocessing.Pool`` of 15 worker processes
        (processes, not threads). Each task's result is collected so that an
        exception raised inside a worker is reported instead of being
        silently discarded by ``apply_async``.

        Args:
            json_obj: Decoded API response; ``json_obj['data']`` holds the
                item dicts accepted by ``get_vid_lis``.

        Raises:
            KeyError: if ``json_obj`` has no ``'data'`` key.
        """
        # A null "data" field is treated as an empty listing.
        items = json_obj['data'] or []
        # === Build the download list ===
        vid_list = get_vid_lis(items)
        # Multi-process download
        pool = Pool(15)
        pending = [
            (obj['name'],
             pool.apply_async(start_load_vid, args=(obj['name'], obj['link'])))
            for obj in vid_list
        ]
        # No more tasks will be submitted.
        pool.close()
        failed = []
        for name, result in pending:
            try:
                # get() re-raises whatever the worker raised.
                result.get()
            except Exception as exc:
                # Report and continue: one bad video must not abort the rest.
                failed.append(name)
                print('%s 视频下载失败: %s' % (name, exc))
        pool.join()
        if failed:
            print('%d 个视频下载失败!' % len(failed))
        else:
            print('所有视频现在完成!')
    
    if __name__ == '__main__':
        # Script entry point: fetch the listing, then hand it to the
        # downloader. The guard also keeps pool worker processes from
        # re-running the spider when they import this module.
        start_spider(callback=start_download)
    

    相关文章

      网友评论

          本文标题:python爬虫系列1-沙沙野视频多线程

          本文链接:https://www.haomeiwen.com/subject/bcjkrktx.html