Crawler Multithreading vs. Serial Performance Test

Author: 采星星的小太阳BavaLI | Published 2020-05-06 16:09
    import time
    import requests
    import concurrent.futures
    import threading
    from multiprocessing import Pool
    import pandas as pd
    # timing decorator
    def get_time(fun):
        def wrapper(*args, **kwargs):
            print('*' * 50)
            print(fun.__name__, 'start.........')
            # start time
            start_time = time.time()
            # run the wrapped function, keeping its return value
            result = fun(*args, **kwargs)
            # end time
            end_time = time.time()
            spend_time = end_time - start_time
            print(fun.__name__, 'finished, took %ss' % spend_time)
            print('*' * 50)
            return result
        return wrapper
    # timing decorator
    
    # read the URLs from a file
    def get_urls_from_file(n):
        df = pd.read_csv('TestUrls.csv')
        urls = list(df['url'][:n])
        return urls
    # read the URLs from a file
    
    # request the data and parse it
    def get_data(url, retries=3):
        headers_value = {'User-Agent': "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36"}
        try:
            html = requests.get(url, headers=headers_value)
            print('Requested URL:', url, '; status code:', html.status_code)
        except requests.exceptions.ConnectionError as e:
            print('Download error:', e)
            html = None
        # give up if the request itself failed
        if html is None:
            return None
        # retry on 5xx server errors, returning the retried result
        if 500 <= html.status_code < 600 and retries:
            print('Server error, retrying!')
            # time.sleep(1)
            return get_data(url, retries - 1)
        return html.text
    # request the data and parse it
    
    # serial
    @get_time
    def Mynormal(urls):
        for url in urls:
            get_data(url)
    # serial
    
    # process pool (decorated with @get_time like the others, so its runtime is also reported)
    @get_time
    def MyprocessPool(num=10):
        pool = Pool(num)
        results = pool.map(get_data, urls)
        pool.close()
        pool.join()
        return results
    # process pool
    
    # multithreading
    @get_time
    def My_multi_thread(max_thread=10):
        def url_process():
            while True:
                try:
                    url = urls.pop()   # list.pop() is atomic under the GIL
                except IndexError:
                    break
                get_data(url, retries=3)
        threads = []
        while len(threads) < max_thread and len(urls):
            # pass the function object; calling it here (url_process()) would run serially
            thread = threading.Thread(target=url_process)
            thread.start()
            threads.append(thread)
        for thread in threads:
            thread.join()
    # multithreading
    
    # thread pool
    @get_time
    def Myfuter(num_of_max=10):
        with concurrent.futures.ThreadPoolExecutor(max_workers=num_of_max) as executor:
            executor.map(get_data, urls)
    # thread pool
    # run the comparison
    if __name__ == '__main__':
        # the more URLs, the clearer the advantage of the parallel versions
        urls = get_urls_from_file(10)
        Mynormal(urls)        # serial
        MyprocessPool(10)     # process pool
        Myfuter(10)           # thread pool
        My_multi_thread(10)   # multithreading (runs last: it consumes urls via pop)
    # run the comparison
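
The script reads its targets from TestUrls.csv, which the post does not include; all it needs is a `url` column. A minimal sketch for generating a stand-in file (the placeholder URLs below are assumptions, not from the post):

    import pandas as pd

    # Hypothetical stand-in for the TestUrls.csv the script expects;
    # the real file is not included in the article.
    sample = pd.DataFrame({'url': [
        'https://www.example.com/',
        'https://www.python.org/',
        'https://httpbin.org/get',
    ]})
    sample.to_csv('TestUrls.csv', index=False)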
    
    

Run results: (screenshot of the original timing output omitted)

A decorator (get_time) is also used here to report how long each function takes to run; see the code above for details.
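
For reference, get_time can wrap any function, not just the crawler variants above. A minimal usage sketch, assuming `get_time` and the `time` import from the code above (`sleep_one` is a made-up example, not from the post):

    @get_time
    def sleep_one():
        time.sleep(1)

    sleep_one()
    # Expected output (the exact elapsed time will vary slightly):
    # **************************************************
    # sleep_one start.........
    # sleep_one finished, took 1.0...s
    # **************************************************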

Original link: https://www.haomeiwen.com/subject/wcuwghtx.html