Crawler Multithreading vs. Serial Performance Test

Author: 采星星的小太阳BavaLI | Published 2020-05-06 16:09
    import time
    import requests
    import concurrent.futures
    import threading
    from multiprocessing import Pool
    import pandas as pd
    # timing decorator
    def get_time(fun):
        def wrapper(*args, **kwargs):
            print('*' * 50)
            print(fun.__name__, 'start.........')
            # start time
            start_time = time.time()
            # run the wrapped function, keeping its return value
            result = fun(*args, **kwargs)
            # end time
            end_time = time.time()
            spend_time = end_time - start_time
            print(fun.__name__, 'finished, took %ss' % spend_time)
            print('*' * 50)
            return result
        return wrapper
    # timing decorator
    
    # read the URLs from a file
    def get_urls_from_file(n):
        df = pd.read_csv('TestUrls.csv')
        urls = list(df['url'][:n])
        return urls
    # read the URLs from a file
    
    # request the data and parse it
    def get_data(url, retries=3):
        headers_value = {'User-Agent': "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36"}
        try:
            html = requests.get(url, headers=headers_value)
            print('Requested URL:', url, '; status code:', html.status_code)
        except requests.exceptions.ConnectionError as e:
            print('Download error:', e)
            html = None
        # give up if the request itself failed
        if html is None:
            return None
        # retry on 5xx server errors, returning the retried result
        if 500 <= html.status_code < 600 and retries:
            print('Server error, retrying!')
            # time.sleep(1)
            return get_data(url, retries - 1)
        return html.text
    # request the data and parse it
    
    # serial
    @get_time
    def Mynormal(urls):
        for url in urls:
            get_data(url)
    # serial
    
    # process pool (decorated with @get_time like the others, so its runtime is also reported)
    @get_time
    def MyprocessPool(num=10):
        pool = Pool(num)
        results = pool.map(get_data, urls)
        pool.close()
        pool.join()
        return results
    # process pool
    
    # multithreading
    @get_time
    def My_multi_thread(max_thread=10):
        def url_process():
            while True:
                try:
                    url = urls.pop()   # list.pop() is atomic under the GIL
                except IndexError:
                    break
                get_data(url, retries=3)
        threads = []
        while len(threads) < max_thread and len(urls):
            # pass the function object; calling it here (url_process()) would run serially
            thread = threading.Thread(target=url_process)
            thread.start()
            threads.append(thread)
        for thread in threads:
            thread.join()
    # multithreading
    
    # thread pool
    @get_time
    def Myfuter(num_of_max=10):
        with concurrent.futures.ThreadPoolExecutor(max_workers=num_of_max) as executor:
            executor.map(get_data, urls)
    # thread pool
    # run the comparison
    if __name__ == '__main__':
        # the more URLs, the clearer the advantage of the parallel versions
        urls = get_urls_from_file(10)
        Mynormal(urls)        # serial
        MyprocessPool(10)     # process pool
        Myfuter(10)           # thread pool
        My_multi_thread(10)   # multithreading (runs last: it consumes urls via pop)
    # run the comparison
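
The script reads its targets from TestUrls.csv, which the post does not include; all it needs is a `url` column. A minimal sketch for generating a stand-in file (the placeholder URLs below are assumptions, not from the post):

    import pandas as pd

    # Hypothetical stand-in for the TestUrls.csv the script expects;
    # the real file is not included in the article.
    sample = pd.DataFrame({'url': [
        'https://www.example.com/',
        'https://www.python.org/',
        'https://httpbin.org/get',
    ]})
    sample.to_csv('TestUrls.csv', index=False)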
    
    

Run results: (screenshot of the original timing output omitted)

A decorator (get_time) is also used here to report how long each function takes to run; see the code above for details.
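
For reference, get_time can wrap any function, not just the crawler variants above. A minimal usage sketch, assuming `get_time` and the `time` import from the code above (`sleep_one` is a made-up example, not from the post):

    @get_time
    def sleep_one():
        time.sleep(1)

    sleep_one()
    # Expected output (the exact elapsed time will vary slightly):
    # **************************************************
    # sleep_one start.........
    # sleep_one finished, took 1.0...s
    # **************************************************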

Original link: https://www.haomeiwen.com/subject/wcuwghtx.html