
Crawling Web Pages with Multiple Processes

Author: BlueCat2016 | Published 2018-07-08 17:43

    Multi-process crawler using multiprocessing.Process
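
    Both scripts below read their URLs from alexa.txt, a tab-separated file
    with a rank in the first column and a URL in the second. The file itself
    is not part of the article, so the following is a minimal stand-in (the
    URLs are placeholders, not the real Alexa list) that lets the code run
    as-is.

    # Write a tiny stand-in for alexa.txt; format: rank<TAB>url, one per line
    sample = '1\thttp://www.example.com/\n2\thttp://www.example.org/\n'
    with open('alexa.txt', 'w') as f:
        f.write(sample)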

    # coding:utf8
    
    import time
    from multiprocessing import Process, Queue
    from queue import Empty  # raised by Queue.get() when the timeout expires
    import requests
    
    # Read the target URLs (second tab-separated column) from alexa.txt
    link_list = []
    with open('alexa.txt', 'r') as f:
        file_list = f.readlines()
        for line in file_list:
            link = line.split('\t')[1]
            link = link.replace('\n', '')
            link_list.append(link)
    
    # print(link_list)
    start = time.time()
    
    
    class MyProcess(Process):
        def __init__(self, name, q):
            Process.__init__(self)
            self.name = name
            self.q = q
    
        def run(self):
            print('Starting', self.pid)
            print('Process_name:%s' % self.name)
            while not self.q.empty():
                crawl(self.q)
            print('Exiting', self.pid)
    
    
    def crawl(q):
        try:
            link = q.get(timeout=2)
        except Empty:
            # Another worker drained the queue between empty() and get()
            return
        try:
            r = requests.get(link)
            print(r.status_code)
        except Exception as e:
            print(link, 'Error: ', e)
    
    
    if __name__ == '__main__':
        # process_names = ['Process_1', 'Process_2', 'Process_3']
        list_size = len(link_list)
        workQueue = Queue(list_size)
    
        # Fill the work queue
        for url in link_list:
            workQueue.put(url)
    
        processes = []
        for i in range(0, 3):
            p = MyProcess('Process_%s' % str(i), workQueue)
            # p.daemon = True
            p.start()
            processes.append(p)

        # Wait for every worker to finish before measuring the elapsed time
        for p in processes:
            p.join()

        end = time.time()
        print('duration: %d' % (end - start))
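
    For comparison, a single-process baseline is sketched below. It is not
    part of the original article; it simply fetches the same link_list
    sequentially so the multi-process timings have a reference point (the
    10-second request timeout is an added assumption).

    # coding:utf8
    # Serial baseline: fetch every URL from alexa.txt one after another.
    import time
    import requests

    link_list = []
    with open('alexa.txt', 'r') as f:
        for line in f.readlines():
            link_list.append(line.split('\t')[1].replace('\n', ''))

    start = time.time()
    for link in link_list:
        try:
            r = requests.get(link, timeout=10)
            print(r.status_code)
        except Exception as e:
            print(link, 'Error: ', e)
    print('duration: %d' % (time.time() - start))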
    
    

    Multi-process crawler using Pool + Queue
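
    Note that the work queue here is created through Manager().Queue() rather
    than multiprocessing.Queue: a plain multiprocessing.Queue cannot be passed
    as an argument to Pool workers, so a managed queue proxy is used instead.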

    # coding:utf8
    
    import time
    import multiprocessing
    from multiprocessing import Pool, Manager
    from queue import Empty  # raised by Queue.get() when the timeout expires
    import requests
    
    link_list = []
    with open('alexa.txt', 'r') as f:
        file_list = f.readlines()
        for line in file_list:
            link = line.split('\t')[1]
            link = link.replace('\n', '')
            link_list.append(link)
    
    # print(link_list)
    start = time.time()
    
    
    def crawl(q, index):
        process_id = 'Process_%s' % str(index)
        while not q.empty():
            try:
                link = q.get(timeout=2)
            except Empty:
                # Queue drained by another worker between empty() and get()
                break
            try:
                r = requests.get(link)
                print(process_id, r.status_code)
            except Exception as e:
                print(process_id, link, 'Error: ', e)
    
    
    if __name__ == '__main__':
        manager = Manager()
        list_size = len(link_list)
        workQueue = manager.Queue(list_size)
    
        # Fill the work queue
        for url in link_list:
            workQueue.put(url)
    
        pool_num = multiprocessing.cpu_count()
        pool = Pool(processes=pool_num)
        for i in range(pool_num):
            pool.apply_async(crawl, args=(workQueue, i))
    
        print('Started project...')
        pool.close()
        pool.join()
        end = time.time()
        print('duration: %d' % (end - start))
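
    Since the full URL list is known before the crawl starts, the work can
    also be distributed without an explicit queue. The sketch below is not
    from the original article: it uses Pool.map over link_list directly, and
    the fetch helper name and 10-second timeout are illustrative assumptions.

    # coding:utf8
    # Queue-free variant: let Pool.map hand the URLs out to the workers.
    import time
    from multiprocessing import Pool
    import requests

    link_list = []
    with open('alexa.txt', 'r') as f:
        for line in f.readlines():
            link_list.append(line.split('\t')[1].replace('\n', ''))

    def fetch(link):
        # Return the status code, or the exception if the request failed
        try:
            return link, requests.get(link, timeout=10).status_code
        except Exception as e:
            return link, e

    if __name__ == '__main__':
        start = time.time()
        with Pool() as pool:
            for link, result in pool.map(fetch, link_list):
                print(link, result)
        print('duration: %d' % (time.time() - start))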
    
