
13. Asynchronous Crawlers

Author: 薛东弗斯 | Published 2024-03-01 21:50

    1. Serial execution

    import requests
    
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'
    }
    urls = [
        'https://downsc.chinaz.net/Files/DownLoad/jianli/202306/zjianli1756.rar',
        'https://downsc.chinaz.net/Files/DownLoad/jianli/202306/zjianli1756.rar',
        'https://downsc.chinaz.net/Files/DownLoad/jianli/202306/zjianli1756.rar'
    ]
    
    def get_content(url):
        print('Crawling:', url)
        # requests.get is a blocking call: it does not return until the full response arrives
        response = requests.get(url=url, headers=headers)
        if response.status_code == 200:
            return response.content
    
    def parse_content(content):
        print('Length of the response data:', len(content))
    
    
    for url in urls:
        content = get_content(url)
        parse_content(content)
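
    To make the blocking cost visible, here is a minimal timing sketch (my addition, reusing the get_content, parse_content, and urls defined above):

    import time
    
    start = time.perf_counter()
    for url in urls:
        content = get_content(url)
        parse_content(content)
    # Each requests.get blocks until it finishes, so the total is roughly
    # the sum of the three individual request times
    print('Serial total: %.2f seconds' % (time.perf_counter() - start))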
    

    Four download tasks executed serially on a single thread take 8 seconds in total:

    import time
    # Single-threaded, serial execution
    
    def get_page(name):  # parameter renamed from `str`, which shadowed the built-in
        print("downloading...", name)
        time.sleep(2)  # simulate a 2-second blocking download
        print("download succeeded:", name)
    
    name_list = ['aa', 'bb', 'cc', 'dd']
    start_time = time.time()
    for name in name_list:
        get_page(name)
    end_time = time.time()
    print("%d seconds passed!" % (end_time - start_time))
    

    Thread pool: execution time drops to 2 seconds

    import time
    # multiprocessing.dummy exposes a Pool with the same API as the process
    # pool, but backed by threads
    from multiprocessing.dummy import Pool
    
    def get_page(name):
        print("downloading...", name)
        time.sleep(2)
        print("download succeeded:", name)
    
    name_list = ['aa', 'bb', 'cc', 'dd']
    start_time = time.time()
    pool = Pool(4)
    # map() hands each list element to get_page; the four calls run in
    # parallel threads, so the total is about 2 seconds instead of 8
    pool.map(get_page, name_list)
    end_time = time.time()
    print("%d seconds passed!" % (end_time - start_time))
    

    Pear Video (梨视频)

    import requests
    from lxml import etree
    import re
    from multiprocessing.dummy import Pool
    # Goal: crawl video data from pearvideo.com
    headers = {
        'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
    }
    # Principle: give the thread pool only the blocking, time-consuming
    # operations; it is not meant to handle every step
    
    # Request the listing page below and parse out each video's
    # detail-page URL and name
    url = 'https://www.pearvideo.com/category_5'
    page_text = requests.get(url=url,headers=headers).text
    
    tree = etree.HTML(page_text)
    li_list = tree.xpath('//ul[@id="listvideoListUl"]/li')
    urls = []  # stores every video's link and name
    for li in li_list:
        detail_url = 'https://www.pearvideo.com/'+li.xpath('./div/a/@href')[0]
        name = li.xpath('./div/a/div[2]/text()')[0]+'.mp4'
        # Request the detail page
        detail_page_text = requests.get(url=detail_url,headers=headers).text
        # Parse the real video address (URL) out of the detail page; it sits
        # in a JavaScript variable, so a regex is used instead of xpath
        ex = 'srcUrl="(.*?)",vdoUrl'
        video_url = re.findall(ex,detail_page_text)[0]
        dic = {
            'name':name,
            'url':video_url
        }
        urls.append(dic)
    # Request a video's URL, fetch the binary data, and persist it
    def get_video_data(dic):
        url = dic['url']
        print(dic['name'], 'downloading......')
        data = requests.get(url=url, headers=headers).content
        # Persist the video to disk
        with open(dic['name'], 'wb') as fp:
            fp.write(data)
            print(dic['name'], 'downloaded successfully!')
    # Use the thread pool only for the downloads (the time-consuming blocking step)
    pool = Pool(4)
    pool.map(get_video_data, urls)
    
    pool.close()
    pool.join()
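
    Since the chapter is about asynchronous crawlers, the coroutine-based alternative is worth a sketch. This is my addition, not the author's code: the same download step written with asyncio + aiohttp (assumed installed via `pip install aiohttp`), reusing the headers and urls list built above:

    import asyncio
    import aiohttp
    
    async def download_video(session, dic):
        # aiohttp's get is non-blocking: while one download waits on the
        # network, the event loop runs the others
        async with session.get(dic['url'], headers=headers) as resp:
            data = await resp.read()
        with open(dic['name'], 'wb') as fp:
            fp.write(data)
        print(dic['name'], 'downloaded successfully!')
    
    async def main():
        async with aiohttp.ClientSession() as session:
            await asyncio.gather(*(download_video(session, dic) for dic in urls))
    
    asyncio.run(main())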
    
