美文网首页Python+人工智能
Python-线程池爬取任务

Python-线程池爬取任务

作者: 杰伊_约翰 | 来源:发表于2019-02-24 16:38 被阅读0次

    1.多任务线程池进行爬取数据,高效率完成
    2.创建线程池，将 url 和要执行的函数提交到线程池，即 pool.submit(函数名, url)
    3.使用xpath进行数据的获取
    4.存储数据为json文件,存在本地

    from concurrent.futures import ThreadPoolExecutor
    import requests
    from requests import exceptions
    from lxml import etree
    import json
    import threading

    以上导入了线程池（concurrent.futures）以及 requests、lxml、json、threading 等所需模块

    def crawlPageDate(url, kwargs=None):
        """Fetch the HTML of one listing page.

        Args:
            url: the page URL to request.
            kwargs: optional extra data from the submitter (unused; defaults
                to None so `pool.submit(crawlPageDate, url)` works — the
                original required it positionally and every task raised
                TypeError).

        Returns:
            (html_text, 200) on a successful request, (None, 404) on a
            non-200 response or any request error.
        """
        print(url, kwargs)

        headers = {
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36',
        }

        try:
            response = requests.get(url, headers=headers)
            if response.status_code == 200:
                print('请求成功')
                # hand the page source to the done-callback for parsing
                return response.text, 200
        except exceptions.HTTPError as err:
            print(err)
        except exceptions.ConnectTimeout as err:
            print(err)
        except exceptions.RequestException as err:
            print(err)

        # non-200 status or any request error falls through to here
        return None, 404
    

    def done(futures):
        """Callback for a finished crawl task: parse the recipe cards out of
        the fetched page and append each as one JSON line to cai.json.

        Args:
            futures: the completed Future; its result is the (html, status)
                pair returned by crawlPageDate.
        """
        print(futures)

        html, status = futures.result()
        print(status)
        if html:
            # parse the page and iterate over the recipe cards
            x_html = etree.HTML(html)
            caipu_list = x_html.xpath('//div[@class="listtyle1"]')
            for cai_div in caipu_list:
                item = {}
                # NOTE: the xpath must be relative (.//) — the original
                # absolute '//img' matched the first image of the WHOLE page
                # for every card, so all items shared one cover image.
                item['coverImage'] = cai_div.xpath('.//img[@class="img"]/@src')[0]
                item['type'] = cai_div.xpath('.//a/strong[@class="gx"]/span/text()')

                if len(item['type']) > 0:
                    item['type'] = item['type'][0]
                else:
                    item['type'] = '暂无'

                item['title'] = cai_div.xpath('.//div[@class="c1"]/strong/text()')[0]
                print(item)

                # serialize file writes across callback threads; try/finally
                # guarantees the lock is released even if the write raises
                lock.acquire()
                try:
                    # explicit utf-8: the JSON contains non-ASCII text
                    # (ensure_ascii=False) and the locale default may not cope
                    with open('cai.json', 'a', encoding='utf-8') as file:
                        json_str = json.dumps(item, ensure_ascii=False) + '\n'
                        file.write(json_str)
                finally:
                    lock.release()
    

    # Create the lock BEFORE any task is submitted: callbacks run on worker
    # threads and may need it immediately (the original created it after the
    # submit loop, racing a NameError in done()).
    lock = threading.Lock()

    if __name__ == '__main__':
        # A small fixed pool is ample for 56 pages; the original passed an
        # astronomically large max_workers value.
        pool = ThreadPoolExecutor(max_workers=8)
        for page in range(1, 57):
            # submit(fn, *args): fn is the task to run, remaining positional
            # arguments are forwarded to it
            url = 'https://www.meishij.net/chufang/diy/jiangchangcaipu/?&page=' + str(page)
            result = pool.submit(crawlPageDate, url)
            # parse/store callback fires when the fetch completes
            result.add_done_callback(done)
        # wait for all fetches (and their callbacks) to finish
        pool.shutdown(wait=True)
    

    相关文章

      网友评论

        本文标题:Python-线程池爬取任务

        本文链接:https://www.haomeiwen.com/subject/pclbyqtx.html